From 05d7cb5dce5cb0bf68c7011a550ba8e462f145d3 Mon Sep 17 00:00:00 2001 From: David Smiley Date: Mon, 29 Jun 2026 12:11:49 -0400 Subject: [PATCH 1/2] Redo Apache RAT. * Upgrade from 0.15 to 0.18 * Use a config file * Remove custom license patterns that match nothing anymore * Ensure it works even without a git working tree (e.g. smoketester source release) * Add a canary to assert that header detection is actually working each run * Synchronize RatTask executions to avoid shared Ant task state issues under parallel Gradle execution * Consolidate all exclude patterns into a single include-everything-then-exclude approach --- gradle.lockfile | 31 ++- gradle/libs.versions.toml | 3 +- gradle/validation/rat-config.xml | 63 +++++ gradle/validation/rat-sources.gradle | 252 ++++++++---------- .../validate-source-patterns.gradle | 42 +-- solr/ui/proguard.pro | 2 + 6 files changed, 215 insertions(+), 178 deletions(-) create mode 100644 gradle/validation/rat-config.xml diff --git a/gradle.lockfile b/gradle.lockfile index 5ffaf058021c..57c414826d43 100644 --- a/gradle.lockfile +++ b/gradle.lockfile @@ -2,16 +2,29 @@ # Manual edits can break the build and are not advised. # This file is expected to be part of source control. 
# To regenerate this file, run: ./gradlew :dependencies --write-locks -commons-cli:commons-cli:1.5.0=ratDeps -commons-io:commons-io:2.11.0=ratDeps +com.github.albfernandez:juniversalchardet:2.5.0=ratDeps +com.github.cliftonlabs:json-simple:3.0.2=ratDeps +commons-beanutils:commons-beanutils:1.9.4=ratDeps +commons-codec:commons-codec:1.19.0=ratDeps +commons-collections:commons-collections:3.2.2=ratDeps +commons-io:commons-io:2.21.0=ratDeps junit:junit:3.8.1=javacc net.java.dev.javacc:javacc:7.0.13=javacc -org.apache.commons:commons-collections4:4.4=ratDeps -org.apache.commons:commons-compress:1.21=ratDeps -org.apache.commons:commons-lang3:3.12.0=ratDeps -org.apache.rat:apache-rat-api:0.15=ratDeps -org.apache.rat:apache-rat-core:0.15=ratDeps -org.apache.rat:apache-rat-tasks:0.15=ratDeps -org.apache.rat:apache-rat:0.15=ratDeps +org.apache.commons:commons-collections4:4.5.0=ratDeps +org.apache.commons:commons-compress:1.28.0=ratDeps +org.apache.commons:commons-csv:1.14.1=ratDeps +org.apache.commons:commons-digester3:3.2=ratDeps +org.apache.commons:commons-lang3:3.20.0=ratDeps +org.apache.commons:commons-text:1.15.0=ratDeps +org.apache.rat:apache-rat-core:0.18=ratDeps +org.apache.rat:apache-rat-tasks:0.18=ratDeps +org.apache.rat:apache-rat-tools:0.18=ratDeps +org.apache.rat:apache-rat:0.18=ratDeps +org.apache.tika:tika-core:3.2.3=ratDeps +org.apache.tika:tika-parser-text-module:3.2.3=ratDeps +org.apache.velocity.tools:velocity-tools-generic:3.1=ratDeps +org.apache.velocity:velocity-engine-core:2.4.1=ratDeps org.eclipse.jdt:ecj:3.39.0=ecjDeps +org.slf4j:slf4j-api:2.0.17=ratDeps +org.slf4j:slf4j-simple:2.0.17=ratDeps empty= diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index afb242b3b1f5..f647c85d4580 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -41,7 +41,7 @@ apache-kafka = "3.9.2" apache-log4j = "2.26.0" apache-lucene = "10.4.0" apache-opennlp = "2.5.9" -apache-rat = "0.15" +apache-rat = "0.18" apache-tika = "3.3.0" 
apache-tomcat = "6.0.53" apache-zookeeper = "3.9.5" @@ -508,6 +508,7 @@ semver4j-semver4j = { module = "org.semver4j:semver4j", version.ref = "semver4j" slf4j-api = { module = "org.slf4j:slf4j-api", version.ref = "slf4j" } slf4j-jcloverslf4j = { module = "org.slf4j:jcl-over-slf4j", version.ref = "slf4j" } slf4j-jultoslf4j = { module = "org.slf4j:jul-to-slf4j", version.ref = "slf4j" } +slf4j-simple = { module = "org.slf4j:slf4j-simple", version.ref = "slf4j" } spotbugs-annotations = { module = "com.github.spotbugs:spotbugs-annotations", version.ref = "spotbugs" } squareup-okhttp3-mockwebserver = { module = "com.squareup.okhttp3:mockwebserver" } squareup-okhttp3-okhttp = { module = "com.squareup.okhttp3:okhttp", version.ref = "squareup-okhttp3-okhttp" } diff --git a/gradle/validation/rat-config.xml b/gradle/validation/rat-config.xml new file mode 100644 index 000000000000..8a70ce7affd4 --- /dev/null +++ b/gradle/validation/rat-config.xml @@ -0,0 +1,63 @@ + + + + + + + + + + + + + + + + All advertising materials + + + + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS + + + + + + ; License: MIT + + + + + + + + + + + + + + + + + diff --git a/gradle/validation/rat-sources.gradle b/gradle/validation/rat-sources.gradle index 6f1f4f1441ee..77645fa1e265 100644 --- a/gradle/validation/rat-sources.gradle +++ b/gradle/validation/rat-sources.gradle @@ -31,11 +31,15 @@ buildscript { // Configure rat dependencies for use in the custom task. configure(rootProject) { configurations { - ratDeps + ratDeps { + exclude group: 'commons-logging', module: 'commons-logging' + exclude group: 'commons-cli', module: 'commons-cli' + } } dependencies { ratDeps libs.apache.rat.rat + ratDeps libs.slf4j.simple } } @@ -104,62 +108,73 @@ allprojects { description = 'Runs Apache Rat checks.' 
ratClasspath.from(rootProject.configurations.ratDeps) + ratConfig.set(rootProject.layout.projectDirectory.file("gradle/validation/rat-config.xml")) def defaultScanFileTree = project.fileTree(projectDir, { - // Only check files tracked by git — skip untracked/gitignored files - // (IDE artifacts, AI assistant configs, etc.) def trackedFiles = gitTrackedFiles(projectDir) if (trackedFiles != null) { - exclude { element -> - // Allow directories through (they are just containers), exclude untracked files - !element.isDirectory() && !trackedFiles.contains(element.relativePath.pathString) - } + // Git checkout: only scan files in the git index — naturally excludes build + // output, gitignored files, and untracked files without explicit patterns. + include { f -> f.isDirectory() || trackedFiles.contains(f.relativePath.pathString) } + + // The following are excluded from the source release, thus no header needed + exclude "dev-docs" + exclude "**/AGENTS.md" + exclude "**/.*" // e.g. .gradle, .github, .junie, .gitignore, etc. + } else { + // Source release (no git): scan everything, exclude generated/ignored paths manually. + include "**" } - // Exclude the build directory — even though it's not git-tracked, Gradle - // validates input/output overlaps at configuration time before the git - // filter closure runs, so this must be excluded explicitly. exclude project.layout.buildDirectory.get().asFile.name - // similarly, at least one subproject writes to .gradle/ for config/setup - exclude ".gradle/**" // Don't check any of the subprojects - they have their own rat tasks. - exclude subprojects.collect {it.projectDir.name} - - // The git index filter above excludes untracked files. These include - // patterns select the file types that should carry license headers. - include "**/*.xml" - include "**/*.md" - include "**/*.py" - include "**/*.sh" - include "**/*.bat" - include "**/*.gradle" - - // Include selected patterns from any source folders. 
We could make this - // relative to source sets but it seems to be of little value - all our source sets - // live under 'src' anyway. - include "src/**" - exclude "src/**/*.png" - exclude "src/**/*.txt" - exclude "src/**/*.zip" - exclude "src/**/*.properties" - exclude "src/**/*.utf8" - exclude "src/**/*.svg" - exclude "src/**/*.csv" - - // TODO: SOLR-15601: Some of these should carry the license, perhaps? - exclude "**/*.html" + exclude subprojects.collect { + projectDir.toPath().relativize(it.projectDir.toPath()).toString().replace('\\', '/') + } + + // Binary and media types — no text-based headers possible. + exclude "**/*.png" + exclude "**/*.jpg" + exclude "**/*.gif" + exclude "**/*.svg" + exclude "**/*.ico" + exclude "**/*.bin" + exclude "**/*.pem" + exclude "**/*.pdf" + + // Structured-data and config files that intentionally carry no license. exclude "**/*.json" + exclude "**/*.jsonl" + exclude "**/*.csv" + + exclude "**/*gradle.lockfile" - // Conditionally apply module-specific patterns. We do it here instead - // of reconfiguring each project because the provider can be made lazy - // and it's easier to manage this way. + // TODO: SOLR-15601 ? + exclude "**/*.properties" + exclude "**/*.txt" + + // TODO: SOLR-15601: HTML should carry the license, perhaps? + exclude "**/*.html" + + // Conditionally apply module-specific patterns. switch (project.path) { case ":": - include "gradlew" - include "gradlew.bat" - // Exclude github stuff (templates, workflows). - exclude ".github" + // Composite included builds: skip build artifacts, src is already covered by "**". + exclude "build-tools/*/build" + // Non-subproject build output. + exclude "buildSrc/build" + exclude "dev-tools/solr-missing-doclet/build" + // IDE project files. + exclude "gradle/ide/eclipse" + exclude "gradle/validation/ecj-lint/ecj.javadocs.prefs" + // Generated/vendored build-tool resources. 
+ exclude "gradle/documentation" + exclude "gradle/wrapper" + // rat-config.xml contains license-text data in its body, which confuses RAT's own detection. + exclude "gradle/validation/rat-config.xml" + // Kotlin JS dependency store. + exclude "kotlin-js-store" exclude "dev-tools/scripts/cloud.sh" exclude "dev-tools/scripts/README.md" @@ -167,21 +182,25 @@ allprojects { // Exclude new CHANGELOG and version-summary for each release exclude "CHANGELOG.md" - exclude "changelog/**/version-summary.md" - exclude "changelog/.templates/*.md" - exclude "changelog/archive.md" - - // The root project also includes patterns for the include composite - // projects. Include their sources in the scan. - include "build-tools/build-infra/src/**" - include "build-tools/missing-doclet/src/**" + exclude "changelog" + break + + case ":solr": + // solr/ui is a registered subproject with its own rat task, but it may be disabled + exclude "ui" + + exclude "licenses" + break + + case ":solr:benchmark": + // this was generated, not hand-authored + exclude "jfr-profile.jfc" break case ":solr:modules:clustering": exclude "src/test-files/META-INF/services/*" break - case ":solr:modules:ltr": // TODO: SOLR-15601: Some of these should carry the license, perhaps? exclude "**/*.py" @@ -214,6 +233,11 @@ allprojects { exclude "**/*.xml" exclude "**/*.sh" exclude "**/*.bat" + exclude "**/*.cmd" + exclude "**/*.js" + exclude "modules" + // zoo.cfg intentionally has no header and serves as the RAT detection canary. 
+ ratCanaryPath.set("/solr/zoo.cfg") break case ":solr:webapp": @@ -221,8 +245,6 @@ allprojects { break case ":solr:solr-ref-guide": - include "*.yml" - include "**/*.adoc" exclude "ui-src/**" break @@ -232,6 +254,8 @@ allprojects { case ":solr:example": exclude "**/*.xml" + exclude "**/*.cfg" + exclude "**/*.js" exclude "films/README.md" break @@ -260,102 +284,52 @@ class RatTask extends DefaultTask { @InputFiles final ConfigurableFileCollection ratClasspath = project.objects.fileCollection() - def generateReport(File reportFile) { - // Set up ant rat task. + @InputFile + final RegularFileProperty ratConfig = project.objects.fileProperty() + + /** Path (as reported by RAT, with leading /) of a file with no header used to verify detection. */ + @Input + @Optional + final Property ratCanaryPath = project.objects.property(String) + + @TaskAction + def execute() { + // RAT's Ant task has shared static state; synchronize to prevent parallel-execution interference. + synchronized (RatTask.class) { doRat() } + } + + private void doRat() { ant.taskdef(resource: 'org/apache/rat/anttasks/antlib.xml', classpath: ratClasspath.asPath) - // Collect all output files for debugging. - String inputFileList = inputFileTrees.get().collectMany {fileTree -> - fileTree.asList() - }.sort().join("\n") - project.file(reportFile.path.replaceAll('.xml$', '-filelist.txt')).setText(inputFileList, - "UTF-8") + File reportFile = xmlReport.get().asFile + reportFile.parentFile.mkdirs() - // Run rat via ant. - ant.report(format: 'xml', reportFile: reportFile, addDefaultLicenseMatchers: true) { + ant.report(outputStyle: 'xml', outputFile: reportFile.absolutePath) { // Pass all gradle file trees to the ant task (Gradle's internal adapters are used). 
inputFileTrees.get().each {fileTree -> fileTree.addToAntBuilder(ant, 'resources', FileCollection.AntType.ResourceCollection) } - - // BSD 4-clause stuff (is disallowed below) - substringMatcher(licenseFamilyCategory: "BSD4 ", - licenseFamilyName: "Original BSD License (with advertising clause)") { - pattern(substring: "All advertising materials") - } - - // BSD-like stuff - substringMatcher(licenseFamilyCategory: "BSD ", licenseFamilyName: "Modified BSD License") { - // brics automaton - pattern(substring: "Copyright (c) 2001-2009 Anders Moeller") - // snowball - pattern(substring: "Copyright (c) 2001, Dr Martin Porter") - // UMASS kstem - pattern(substring: "THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF MASSACHUSETTS AND OTHER CONTRIBUTORS") - // Egothor - pattern(substring: "Egothor Software License version 1.00") - // JaSpell - pattern(substring: "Copyright (c) 2005 Bruno Martins") - // d3.js - pattern(substring: "THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS") - // highlight.js - pattern(substring: "THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS") - } - - // MIT-like - substringMatcher(licenseFamilyCategory: "MIT ", licenseFamilyName: "Modified BSD License") { - // ICU license - pattern(substring: "Permission is hereby granted, free of charge, to any person obtaining a copy") - // ui-grid - pattern(substring: " ; License: MIT") - } - - // Apache - substringMatcher(licenseFamilyCategory: "AL ", licenseFamilyName: "Apache") { - pattern(substring: "Licensed to the Apache Software Foundation (ASF) under") - // this is the old - school one under some files - pattern(substring: 'Licensed under the Apache License, Version 2.0 (the "License")') - } - - substringMatcher(licenseFamilyCategory: "GEN ", licenseFamilyName: "Generated") { - // svg files generated by gnuplot - pattern(substring: "Produced by GNUPLOT") - // snowball stemmers generated by snowball compiler - pattern(substring: "Generated by Snowball") - // parsers generated by 
antlr - pattern(substring: "ANTLR GENERATED CODE") + config { + fileset(file: ratConfig.get().asFile.absolutePath) } - - approvedLicense(familyName: "Apache") - approvedLicense(familyName: "The MIT License") - approvedLicense(familyName: "Modified BSD License") - approvedLicense(familyName: "Generated") } - } - def printUnknownFiles(File reportFile) { def ratXml = new XmlSlurper().parse(reportFile) - def errors = [] - ratXml.resource.each {resource -> - if (resource.'license-approval'.@name.text() == "false") { - errors << "Unknown license: ${resource.@name.text()}" - } - } - def checkProp = "validation.rat.failOnError" - project.failOrWarn(checkProp, "Detected license header issues", errors) - } - - @TaskAction - def execute() { - def origEncoding = System.getProperty("file.encoding") - try { - File reportFile = xmlReport.get().asFile - generateReport(reportFile) - printUnknownFiles(reportFile) - } finally { - if (System.getProperty("file.encoding") != origEncoding) { - throw new GradleException("Something is wrong: Apache RAT changed file.encoding to ${System.getProperty('file.encoding')}?") + def violations = ratXml.resource + .findAll { it.license.any { lic -> lic.@approval.text() == 'false' } } + .collect { resource -> + def licName = resource.license.find { lic -> lic.@approval.text() == 'false' }.@name.text() + "${resource.@name.text()} ${licName}" + } + if (ratCanaryPath.present) { + def canary = ratCanaryPath.get() + def hit = violations.find { it.startsWith(canary) } + if (hit == null) { + throw new GradleException("RAT self-test failed: '${canary}' was not flagged — is license detection broken?") } + violations.remove(hit) } + violations.each { logger.error(it.toString()) } + project.failOrWarn("validation.rat.failOnError", "Detected license header issues", violations) } } diff --git a/gradle/validation/validate-source-patterns.gradle b/gradle/validation/validate-source-patterns.gradle index aeaca6c97e9a..f88c789b2286 100644 --- 
a/gradle/validation/validate-source-patterns.gradle +++ b/gradle/validation/validate-source-patterns.gradle @@ -15,24 +15,10 @@ * limitations under the License. */ -import org.apache.rat.Defaults -import org.apache.rat.document.impl.FileDocument -import org.apache.rat.api.MetaData - import javax.inject.Inject; import org.gradle.internal.logging.progress.ProgressLoggerFactory import org.gradle.internal.logging.progress.ProgressLogger -buildscript { - repositories { - mavenCentral() - } - - dependencies { - classpath libs.apache.rat.rat - } -} - def extensions = [ 'adoc', 'bat', @@ -171,8 +157,6 @@ class ValidateSourcePatternsTask extends DefaultTask { def javadocsPattern = ~$/(?sm)^\Q/**\E(.*?)\Q*/\E/$; def javaCommentPattern = ~$/(?sm)^\Q/*\E(.*?)\Q*/\E/$; def xmlCommentPattern = ~$/(?sm)\Q\E/$; - def lineSplitter = ~$/[\r\n]+/$; - def licenseMatcher = Defaults.createDefaultMatcher(); def validLoggerPattern = ~$/(?s)\b(private\s|static\s|final\s){3}+\s*Logger\s+\p{javaJavaIdentifierStart}+\s+=\s+\QLoggerFactory.getLogger(MethodHandles.lookup().lookupClass());\E/$; def validLoggerNamePattern = ~$/(?s)\b(private\s|static\s|final\s){3}+\s*Logger\s+log+\s+=\s+\QLoggerFactory.getLogger(MethodHandles.lookup().lookupClass());\E/$; def packagePattern = ~$/(?m)^\s*package\s+org\.apache.*;/$; @@ -180,23 +164,26 @@ class ValidateSourcePatternsTask extends DefaultTask { def extendsLuceneTestCasePattern = ~$/public.*?class.*?extends.*?LuceneTestCase[^\n]*?\n/$; def validSPINameJavadocTag = ~$/(?s)\s*\*\s*@lucene\.spi\s+\{@value #NAME\}/$; - def isLicense = {matcher, ratDocument -> - licenseMatcher.reset() - return lineSplitter.split(matcher.group(1)).any {licenseMatcher.match(ratDocument, it)} + // Detects whether a comment block contains Apache license header text. 
+ def isLicense = { matcher -> + def content = matcher.group(1) + content.contains('Licensed to the Apache Software Foundation') || + content.contains('Licensed under the Apache License') || + content.contains('http://www.apache.org/licenses/LICENSE-2.0') || + content.contains('https://www.apache.org/licenses/LICENSE-2.0') } def checkLicenseHeaderPrecedes = {f, description, contentPattern, commentPattern, - text, - ratDocument -> + text -> def contentMatcher = contentPattern.matcher(text); if (contentMatcher.find()) { def contentStartPos = contentMatcher.start(); def commentMatcher = commentPattern.matcher(text); while (commentMatcher.find()) { - if (isLicense(commentMatcher, ratDocument)) { + if (isLicense(commentMatcher)) { if (commentMatcher.start() < contentStartPos) { break; // This file is all good, so break loop: license header precedes 'description' definition @@ -228,11 +215,9 @@ class ValidateSourcePatternsTask extends DefaultTask { } } def javadocsMatcher = javadocsPattern.matcher(text); - def ratDocument = new FileDocument(f); while (javadocsMatcher.find()) { - if (isLicense(javadocsMatcher, ratDocument)) { - reportViolation(f, String.format(Locale.ENGLISH, 'javadoc-style license header [%s]', - ratDocument.getMetaData().value(MetaData.RAT_URL_LICENSE_FAMILY_NAME))); + if (isLicense(javadocsMatcher)) { + reportViolation(f, 'javadoc-style license header'); } } if (f.name.endsWith('.java')) { @@ -269,8 +254,7 @@ class ValidateSourcePatternsTask extends DefaultTask { 'package', packagePattern, javaCommentPattern, - text, - ratDocument); + text); if (f.name.contains("Test")) { checkMockitoAssume(f, text); } @@ -289,7 +273,7 @@ class ValidateSourcePatternsTask extends DefaultTask { } } if (f.name.endsWith('.xml')) { - checkLicenseHeaderPrecedes(f, '', xmlTagPattern, xmlCommentPattern, text, ratDocument); + checkLicenseHeaderPrecedes(f, '', xmlTagPattern, xmlCommentPattern, text); } } progress.completed() diff --git a/solr/ui/proguard.pro 
b/solr/ui/proguard.pro index c3a4b01f3f06..120b297213e6 100644 --- a/solr/ui/proguard.pro +++ b/solr/ui/proguard.pro @@ -1,3 +1,5 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements; and to You under the Apache License, Version 2.0. + -keep class org.apache.solr.ui.domain.** { *; } -keep class org.apache.solr.ui.*.data.** { *; } -keep class org.apache.solr.ui.*.domain.** { *; } From 7f40e42d80ee634585c1ec14f0371f18faa85a55 Mon Sep 17 00:00:00 2001 From: David Smiley Date: Fri, 3 Jul 2026 09:46:59 -0400 Subject: [PATCH 2/2] Exclude .gradle --- gradle/validation/rat-sources.gradle | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gradle/validation/rat-sources.gradle b/gradle/validation/rat-sources.gradle index 77645fa1e265..b47f4399ddd6 100644 --- a/gradle/validation/rat-sources.gradle +++ b/gradle/validation/rat-sources.gradle @@ -120,10 +120,12 @@ allprojects { // The following are excluded from the source release, thus no header needed exclude "dev-docs" exclude "**/AGENTS.md" - exclude "**/.*" // e.g. .gradle, .github, .junie, .gitignore, etc. + exclude "**/.*" // e.g. .gradle, .github, .gitignore, etc. } else { // Source release (no git): scan everything, exclude generated/ignored paths manually. include "**" + + exclude ".gradle" // gradle will create this dir } exclude project.layout.buildDirectory.get().asFile.name