diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..19c06ba
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,14 @@
+# Keep the build context small and never leak internal docs or local build output into image layers.
+.git
+.github
+dev
+**/target
+# A bare *.iml / *.log only matches at the context root; the **/ prefix also covers the modules.
+**/*.iml
+.idea
+.vscode
+**/*.log
+# Engine test-resource output dirs (generated by the test suite; never needed for a build).
+engine/src/test/resources/test_integration_output
+engine/src/test/resources/test_sample_output
+engine/src/test/resources/test_integration_queries
diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml
new file mode 100644
index 0000000..1f595ac
--- /dev/null
+++ b/.github/workflows/docker-publish.yml
@@ -0,0 +1,56 @@
+name: docker-publish
+
+# Build the server image on every PR (validation, no push) and publish to GHCR on pushes to main,
+# version tags, and manual runs. The image is the self-host distribution channel: `docker run` the
+# gateway next to your CSVs and point an MCP client at it.
+on:
+  push:
+    branches: [ main ]
+    tags: [ 'v*' ]
+  pull_request:
+    branches: [ main ]
+  workflow_dispatch:
+
+permissions:
+  contents: read
+  packages: write
+
+jobs:
+  build-push:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Log in to GHCR
+        # Only needed to push; skip on PRs (incl. forks, where GITHUB_TOKEN is read-only).
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Image metadata
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ghcr.io/${{ github.repository_owner }}/cuckoodb
+          tags: |
+            type=ref,event=branch
+            type=ref,event=tag
+            type=sha
+            type=raw,value=latest,enable={{is_default_branch}}
+
+      - name: Build and push
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          # PRs validate the build only; every other trigger (main, tags, manual runs) publishes.
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..f8e07b7
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,27 @@
+# syntax=docker/dockerfile:1
+
+# Build the server fat jar (and its engine dependency) inside the image, so the build is
+# self-contained and reproducible. A BuildKit cache mount keeps the Maven repo warm across
+# rebuilds without baking it into a layer. Tests are CI's job; skip them here for a fast image.
+FROM eclipse-temurin:17-jdk AS build
+WORKDIR /src
+COPY . .
+RUN --mount=type=cache,target=/root/.m2 \
+    chmod +x mvnw && ./mvnw -pl server -am clean package -Dmaven.test.skip=true -q
+
+# Runtime: JRE only, non-root, just the executable Spring Boot jar.
+FROM eclipse-temurin:17-jre AS runtime
+RUN groupadd --system cuckoo && useradd --system --gid cuckoo --home /app cuckoo
+WORKDIR /app
+COPY --from=build /src/server/target/cuckoodb-server-*.jar /app/cuckoodb-server.jar
+
+# Self-host layout: mount a folder of CSVs at /cuckoodb/data and they are loaded into the catalog
+# at startup; uploads (when enabled) persist under /cuckoodb/work. data-dir is the PARENT of the
+# data/ folder the engine scans, hence /cuckoodb with CSVs in /cuckoodb/data.
+RUN mkdir -p /cuckoodb/data /cuckoodb/work && chown -R cuckoo:cuckoo /cuckoodb
+ENV CUCKOODB_DATA_DIR=/cuckoodb \
+    CUCKOODB_WORK_DIR=/cuckoodb/work
+
+USER cuckoo
+EXPOSE 8080
+ENTRYPOINT ["java", "-jar", "/app/cuckoodb-server.jar"]
diff --git a/README.md b/README.md
index 2d74fdb..0cd763a 100644
--- a/README.md
+++ b/README.md
@@ -97,6 +97,21 @@ java -cp engine/target/cuckoodb-engine-1.0.0-jar-with-dependencies.jar \
 
 Both `--max-tuples` and `--timeout-ms` are optional and independent. Omit either to impose no limit on that dimension.
 
+### Run the server as a container
+
+The Spring Boot gateway (REST + MCP) ships as a container image, so you can run it next to your data with no Java toolchain. Put your CSV files in a folder and mount it at `/cuckoodb/data`; the CSVs found there are loaded into the catalog at startup:
+
+```bash
+docker run --rm -p 8080:8080 \
+  -v /path/to/your/csvs:/cuckoodb/data \
+  ghcr.io/jinba1/cuckoodb:latest
+```
+
+- **REST:** `POST http://localhost:8080/queries`, `GET /tables`, `GET /tables/{name}` (OpenAPI at `/swagger-ui.html`).
+- **MCP:** Streamable-HTTP endpoint at `http://localhost:8080/mcp` — point an MCP client at it to query your CSVs with `list_tables` / `describe_table` / `sample_rows` / `explain_query` / `query`.
+
+The image is published to GHCR on each push to `main` and for every `v*` version tag. To build it locally instead: `docker build -t cuckoodb .`
+
 ### Query budgets
 
 The engine enforces **total-work semantics**: every tuple emitted by any operator in the tree counts against the budget, including intermediate tuples that are later filtered or joined. A cross-product explosion that never produces output rows will still hit the tuple limit. The timeout clock starts lazily at the first tuple emission.