diff --git a/.github/workflows/container.yml b/.github/workflows/container.yml
new file mode 100644
index 0000000..459686d
--- /dev/null
+++ b/.github/workflows/container.yml
@@ -0,0 +1,37 @@
+name: Publish bulk loader Docker container
+
+on:
+  workflow_dispatch:
+
+jobs:
+  build_and_push:
+    runs-on: ubuntu-latest
+    # GITHUB_TOKEN must be granted package write access explicitly to push to ghcr.io
+    permissions:
+      contents: read
+      packages: write
+    strategy:
+      matrix:
+        namespace: [New_Mexico_Water_Data_Catalog]
+    steps:
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v4
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v4
+
+      - name: Build and push
+        uses: docker/build-push-action@v7
+        with:
+          file: ./bulk_loader/Dockerfile
+          push: true
+          tags: ghcr.io/dathere/ckan_geoconnex_bulk_runner:${{ matrix.namespace }}
+          cache-from: type=gha,scope=ckan_geoconnex_bulk_runner
+          platforms: linux/amd64
+          cache-to: type=gha,mode=max,scope=ckan_geoconnex_bulk_runner
+          build-args: |
+            NAMESPACE=${{ matrix.namespace }}
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 3b91bcb..39b972d 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -2,11 +2,15 @@ name: Publish CKAN-Geoconnex JSONL file as latest release
 on:
   release:
     types: [published]
+  workflow_dispatch:
 permissions:
   # To upload to releases
   contents: write
 jobs:
   publish:
+    strategy:
+      matrix:
+        ckan_instance: [{ name: New_Mexico_Water_Data_Catalog, url: https://catalog.newmexicowaterdata.org, token: NMWDC_API_BULK_LOADER_TOKEN }]
     name: Publish JSONL file
     runs-on: ubuntu-latest
     steps:
@@ -20,8 +24,14 @@ jobs:
       - name: Run generate_release crate and upload JSONL file
         run: |
           cd ${{github.workspace}}
-          cargo run -p generate_release --release --verbose > ckan-geoconnex-web-resources.jsonl
-          gh release upload ${{github.event.release.tag_name}} ckan-geoconnex-web-resources.jsonl
+          cargo run -p generate_release --release --verbose > ${{ matrix.ckan_instance.name }}.jsonl
+          # workflow_dispatch runs carry no release payload, so fall back to the latest release's tag;
+          # --clobber lets a manual re-run replace an asset that was already uploaded
+          TAG="${{ github.event.release.tag_name }}"
+          if [ -z "$TAG" ]; then TAG="$(gh release view --json tagName --jq .tagName)"; fi
+          gh release upload "$TAG" ${{ matrix.ckan_instance.name }}.jsonl --clobber
         env:
           GITHUB_TOKEN: ${{ github.TOKEN }}
-          NMWDC_API_BULK_LOADER_TOKEN: ${{ secrets.NMWDC_API_BULK_LOADER_TOKEN }}
+          NAMESPACE: ${{ matrix.ckan_instance.name }}
+          INSTANCE_URL: ${{ matrix.ckan_instance.url }}
+          API_TOKEN: ${{ secrets[matrix.ckan_instance.token] }}
diff --git a/bulk_loader/Dockerfile b/bulk_loader/Dockerfile
index 9b049ec..ebe410d 100644
--- a/bulk_loader/Dockerfile
+++ b/bulk_loader/Dockerfile
@@ -2,8 +2,15 @@ FROM rust:1.96 AS builder
 WORKDIR /app
 RUN rustup set profile minimal
 COPY . .
-RUN cargo build --release
+# bulk_loader reads NAMESPACE at compile time (std::env!), so it must be set before the build
+ARG NAMESPACE
+ENV NAMESPACE=$NAMESPACE
+RUN cargo build -p bulk_loader --release
 
 FROM ubuntu:latest
+# CA certificates for the HTTPS release download; single layer, apt lists removed afterwards
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
 COPY --from=builder /app/target/release/bulk_loader /
 ENTRYPOINT ["/bulk_loader"]
diff --git a/bulk_loader/src/main.rs b/bulk_loader/src/main.rs
index d84216f..d36133f 100644
--- a/bulk_loader/src/main.rs
+++ b/bulk_loader/src/main.rs
@@ -2,9 +2,13 @@ use anyhow::Result;
 
 #[tokio::main]
 async fn main() -> Result<()> {
+    // Geoconnex namespace of the target CKAN instance, fixed at compile time; selects the release asset
+    let namespace = std::env!("NAMESPACE");
     // Get latest release data which is organized as a single JSONL file
     // at https://github.com/dathere/ckan_geoconnex_bulk_runner/releases/latest
-    let body = reqwest::get("https://github.com/dathere/ckan_geoconnex_bulk_runner/releases/latest/download/ckan-geoconnex-web-resources.jsonl")
+    let body = reqwest::get(format!("https://github.com/dathere/ckan_geoconnex_bulk_runner/releases/latest/download/{namespace}.jsonl"))
         .await?
+        // Fail on a non-success status (e.g. no asset for this namespace) instead of reading an error page as JSONL
+        .error_for_status()?
         .text()
         .await?;
diff --git a/ckan_geoconnex_bulk_runner_py/src/lib.rs b/ckan_geoconnex_bulk_runner_py/src/lib.rs
index 76a45d4..26c537f 100644
--- a/ckan_geoconnex_bulk_runner_py/src/lib.rs
+++ b/ckan_geoconnex_bulk_runner_py/src/lib.rs
@@ -6,25 +6,26 @@ use pyo3::prelude::*;
 mod ckan_geoconnex_bulk_runner_py {
     use pyo3::{exceptions::PyException, prelude::*};
 
-    #[pyfunction]
-    /// Construct Geoconnex-compatible JSON-LD as a string from dataset metadata.
-    ///
-    /// Input: Dataset metadata (output of /package_show for a CKAN dataset) as a string.
-    /// Output: Constructed Geoconnex-compatible JSON-LD as a string.
-    fn construct_dataset_jsonld_from_metadata(dataset_metadata: String) -> PyResult {
-        match serde_json::to_value(dataset_metadata) {
-            Ok(dataset_json) => {
-                match geoconnex_utils::jsonld::construct_dataset_jsonld_from_metadata(dataset_json)
-                {
-                    Ok(jsonld) => serde_json::to_string(&jsonld).map_err(|e| {
-                        PyException::new_err(format!(
-                            "Error when converting JSON-LD to string: {e}"
-                        ))
-                    }),
-                    Err(e) => Err(PyException::new_err(e.to_string())),
-                }
-            }
-            Err(e) => Err(PyException::new_err(e.to_string())),
-        }
-    }
+    // Disabled: geoconnex_utils::jsonld::construct_dataset_jsonld_from_metadata now also takes
+    // instance_url and namespace, which this binding does not pass.
+    // #[pyfunction]
+    // Construct Geoconnex-compatible JSON-LD as a string from dataset metadata.
+    //
+    // Input: Dataset metadata (output of /package_show for a CKAN dataset) as a string.
+    // Output: Constructed Geoconnex-compatible JSON-LD as a string.
+    // fn construct_dataset_jsonld_from_metadata(dataset_metadata: String) -> PyResult {
+    //     match serde_json::to_value(dataset_metadata) {
+    //         Ok(dataset_json) => {
+    //             match geoconnex_utils::jsonld::construct_dataset_jsonld_from_metadata(dataset_json)
+    //             {
+    //                 Ok(jsonld) => serde_json::to_string(&jsonld).map_err(|e| {
+    //                     PyException::new_err(format!(
+    //                         "Error when converting JSON-LD to string: {e}"
+    //                     ))
+    //                 }),
+    //                 Err(e) => Err(PyException::new_err(e.to_string())),
+    //             }
+    //         }
+    //         Err(e) => Err(PyException::new_err(e.to_string())),
+    //     }
+    // }
 }
diff --git a/generate_release/src/main.rs b/generate_release/src/main.rs
index db2abf8..944d17e 100644
--- a/generate_release/src/main.rs
+++ b/generate_release/src/main.rs
@@ -4,15 +4,26 @@ use std::collections::HashMap;
 
 #[tokio::main]
 async fn main() -> Result<()> {
-    // Identify required header data
-    let Ok(nmwdc_token) = std::env::var("NMWDC_API_BULK_LOADER_TOKEN") else {
-        bail!("Could not find environment variable NMWDC_API_BULK_LOADER_TOKEN.");
-    };
+    // Read the per-instance configuration at runtime so the API token is never compiled
+    // into the binary and the crate still builds when these variables are unset
+    let Ok(namespace) = std::env::var("NAMESPACE") else {
+        bail!("Could not find environment variable NAMESPACE.");
+    };
+    let Ok(token) = std::env::var("API_TOKEN") else {
+        bail!("Could not find environment variable API_TOKEN.");
+    };
+    let Ok(instance_url) = std::env::var("INSTANCE_URL") else {
+        bail!("Could not find environment variable INSTANCE_URL.");
+    };
+    // Re-borrow as &str so the rest of main can copy these around like string constants
+    let namespace = namespace.as_str();
+    let token = token.as_str();
+    let instance_url = instance_url.as_str();
 
     let mut headers = HashMap::new();
-    headers.insert("x-geoconnex-runner".to_string(), nmwdc_token);
+    headers.insert("x-geoconnex-runner".to_string(), token.to_string());
     let ckan = ckanaction::CKAN::builder()
-        .url("https://catalog.newmexicowaterdata.org")
+        .url(instance_url)
         .headers(headers)
         .build();
 
@@ -68,6 +79,8 @@ async fn main() -> Result<()> {
             // 2. Construct JSON-LD based on the data from /package_show
             let jsonld = match construct_dataset_jsonld_from_metadata(
                 dataset_metadata.to_owned(),
+                instance_url.to_string(),
+                namespace.to_string(),
             ) {
                 Ok(j) => j,
                 Err(e) => {
diff --git a/geoconnex_utils/src/jsonld.rs b/geoconnex_utils/src/jsonld.rs
index 0cbddb9..f28429b 100644
--- a/geoconnex_utils/src/jsonld.rs
+++ b/geoconnex_utils/src/jsonld.rs
@@ -3,6 +3,8 @@ use serde_json::json;
 
 pub fn construct_dataset_jsonld_from_metadata(
     dataset_metadata: serde_json::Value,
+    instance_url: String,
+    namespace: String,
 ) -> Result {
     let dataset_id = dataset_metadata.get("id").unwrap().as_str().unwrap();
     eprintln!("Attempting to construct JSON-LD for dataset {dataset_id}");
@@ -60,14 +62,13 @@ pub fn construct_dataset_jsonld_from_metadata(
         },
         "@type": "Dataset",
-        // TODO: Customize namespace based on CKAN instance being used
-        "@id": format!("https://geoconnex.us/ckan/sandbox/{dataset_id}"),
+        "@id": format!("https://geoconnex.us/ckan/{namespace}/{dataset_id}"),
         "name": dataset_title,
         "provider": {
             "@type": "Organization",
             "name": organization_name
         },
-        // TODO: Customize CKAN instance URL based on CKAN instance being used
-        "url": format!("https://sandbox.opendataportal.us/dataset/{dataset_id}")
+        // Trim a trailing slash so an INSTANCE_URL like "https://host/" does not yield "//dataset/..."
+        "url": format!("{}/dataset/{dataset_id}", instance_url.trim_end_matches('/'))
     });
     let jsonld_map = jsonld.as_object_mut().unwrap();
     if about.len() > 0 {