From e3be4f90095cb2e3973d4552abc13c180d7efcb0 Mon Sep 17 00:00:00 2001
From: Rich Bowen
Date: Thu, 7 May 2026 09:45:59 -0400
Subject: [PATCH] Reduce false positives in privacy policy check

Many projects host their own privacy policy page on their *.apache.org
subdomain (e.g., beam.apache.org/privacy_policy,
karaf.apache.org/privacy.html). These pages typically mirror or link to
the canonical ASF privacy policy, but are currently flagged as
non-compliant because the validation regex only accepts two exact
canonical URLs.

This change adds a third alternative that accepts any apache.org or
*.apache.org URL in which 'privacy', 'privaci' or 'privace' appears
after the host (covering privacy, privacy-policy, privacypolicy, etc.).
This eliminates 17 of 19 privacy warnings as false positives while
still correctly rejecting links to non-ASF domains (e.g.,
policies.google.com).

The host is matched label by label, so look-alike domains that merely
end in "apache.org" (e.g., evilapache.org, not-apache.org) are rejected.

Also adds rspec tests for the privacy check.
---
 lib/spec/lib/sitestandards_spec.rb | 25 +++++++++++++++++++++++++
 lib/whimsy/sitestandards.rb        |  2 ++
 2 files changed, 27 insertions(+)

diff --git a/lib/spec/lib/sitestandards_spec.rb b/lib/spec/lib/sitestandards_spec.rb
index 5667a15d9c..393d3bc946 100644
--- a/lib/spec/lib/sitestandards_spec.rb
+++ b/lib/spec/lib/sitestandards_spec.rb
@@ -42,4 +42,29 @@
       expect("/downloads.html").not_to match(valid)
     end
   end
+
+  describe 'check for links to a privacy policy' do
+    valid = SiteStandards::COMMON_CHECKS['privacy'][SiteStandards::CHECK_VALIDATE]
+    it "should recognize the canonical privacy policy URL" do
+      expect("https://privacy.apache.org/policies/privacy-policy-public.html").to match(valid)
+    end
+    it "should recognize the legacy foundation privacy URL" do
+      expect("https://www.apache.org/foundation/policies/privacy.html").to match(valid)
+      expect("https://apache.org/foundation/policies/privacy.html").to match(valid)
+    end
+    it "should recognize project-level privacy pages on *.apache.org" do
+      expect("https://beam.apache.org/privacy_policy").to match(valid)
+      expect("https://karaf.apache.org/privacy.html").to match(valid)
+      expect("https://pig.apache.org/privacypolicy.html").to match(valid)
+      expect("https://systemds.apache.org/privacy-policy").to match(valid)
+      expect("https://hudi.apache.org/asf/privacy").to match(valid)
+      expect("https://cwiki.apache.org/confluence/display/KNOX/Privacy+Policy").to match(valid)
+    end
+    it "should reject non-ASF privacy pages" do
+      expect("https://policies.google.com/privacy").not_to match(valid)
+      expect("https://github.com/apache/privacy-website").not_to match(valid)
+      expect("https://example.com/privacy-policy.html").not_to match(valid)
+      expect("https://evilapache.org/privacy").not_to match(valid)
+    end
+  end
 end
diff --git a/lib/whimsy/sitestandards.rb b/lib/whimsy/sitestandards.rb
index dcd833197f..2302b054c9 100644
--- a/lib/whimsy/sitestandards.rb
+++ b/lib/whimsy/sitestandards.rb
@@ -117,6 +117,8 @@ module SiteStandards
       CHECK_VALIDATE => %r{\Ahttps://privacy\.apache\.org/policies/privacy-policy-public\.html\z
                            |
                            \Ahttps?://(?:www\.)?apache\.org/foundation/policies/privacy\.html\z
+                           |
+                           \Ahttps?://(?:[a-z0-9-]+\.)*apache\.org/.*privac[yie]
                         }ix,
       CHECK_TYPE => 'href',
       CHECK_POLICY => 'https://www.apache.org/foundation/marks/pmcs.html#navigation',