Compare commits

...

15 commits
0.0.0 ... main

Author SHA1 Message Date
rzmk
e009754ba7 ci: fix namespace matrix value 2026-06-23 14:29:33 -04:00
rzmk
4067cdf382 build: include ENV from ARG 2026-06-23 14:24:32 -04:00
rzmk
566ea8cbc9 ci: fix build-args syntax 2026-06-23 14:21:24 -04:00
rzmk
1a82620420 ci: add build-args 2026-06-23 14:20:23 -04:00
rzmk
4f4c576d97 ci: set tag to namespace 2026-06-23 14:16:04 -04:00
rzmk
eb714aae81 fix: JSONL names 2026-06-23 14:13:44 -04:00
rzmk
c6de5d6690 ci: use secret outside of matrix 2026-06-23 14:01:31 -04:00
rzmk
0c4d9488f3 ci: fix syntax 2026-06-23 13:44:19 -04:00
rzmk
7c8fac233e feat: multi-CKAN-instance compatibility 2026-06-23 13:38:24 -04:00
rzmk
59564c9de3 build: include ca-certificates in container 2026-06-15 13:02:51 -04:00
rzmk
9da7d042de build: only build bulk_loader crate 2026-06-15 11:49:55 -04:00
rzmk
f17b3a7700 ci: use default Git context 2026-06-15 11:48:13 -04:00
rzmk
c29108a278 ci: fix file parameter 2026-06-15 11:42:06 -04:00
rzmk
235863b84c ci: add container publish workflow 2026-06-15 11:40:00 -04:00
rzmk
c45ecd0fa9 fix: use nmwdh for JSON-LD construction 2026-06-15 11:31:15 -04:00
7 changed files with 82 additions and 34 deletions

33
.github/workflows/container.yml vendored Normal file
View file

@ -0,0 +1,33 @@
# Builds the bulk_loader container image and pushes it to the GitHub Container
# Registry (ghcr.io). One image tag is published per entry in the `namespace`
# matrix, and the same value is forwarded to the image build as NAMESPACE.
name: Publish bulk loader Docker container
on:
  # Manual trigger only; there is no push/release trigger for this workflow.
  workflow_dispatch:
jobs:
  build_and_push:
    runs-on: ubuntu-latest
    # Declare the token scopes explicitly so the push does not depend on the
    # repository/organisation default GITHUB_TOKEN permissions:
    #   contents: read  - lets the default Git build context fetch the repo
    #   packages: write - required to push images to ghcr.io
    permissions:
      contents: read
      packages: write
    strategy:
      matrix:
        # Each value becomes both the image tag and the NAMESPACE build arg.
        namespace: [New_Mexico_Water_Data_Catalog]
    steps:
      - name: Login to GitHub Container Registry
        uses: docker/login-action@v4
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v4
      - name: Build and push
        uses: docker/build-push-action@v7
        with:
          # No `context` is given, so the action's default build context is used.
          file: ./bulk_loader/Dockerfile
          push: true
          tags: ghcr.io/dathere/ckan_geoconnex_bulk_runner:${{ matrix.namespace }}
          # Read and write the GitHub Actions cache under one shared scope.
          cache-from: type=gha,scope=ckan_geoconnex_bulk_runner
          platforms: linux/amd64
          cache-to: type=gha,mode=max,scope=ckan_geoconnex_bulk_runner
          build-args: |
            NAMESPACE=${{ matrix.namespace }}

View file

@ -2,11 +2,15 @@ name: Publish CKAN-Geoconnex JSONL file as latest release
on: on:
release: release:
types: [published] types: [published]
workflow_dispatch:
permissions: permissions:
# To upload to releases # To upload to releases
contents: write contents: write
jobs: jobs:
publish: publish:
strategy:
matrix:
ckan_instance: [{ name: New_Mexico_Water_Data_Catalog, url: https://catalog.newmexicowaterdata.org, token: NMWDC_API_BULK_LOADER_TOKEN }]
name: Publish JSONL file name: Publish JSONL file
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
@ -20,8 +24,10 @@ jobs:
- name: Run generate_release crate and upload JSONL file - name: Run generate_release crate and upload JSONL file
run: | run: |
cd ${{github.workspace}} cd ${{github.workspace}}
cargo run -p generate_release --release --verbose > ckan-geoconnex-web-resources.jsonl cargo run -p generate_release --release --verbose > ${{ matrix.ckan_instance.name }}.jsonl
gh release upload ${{github.event.release.tag_name}} ckan-geoconnex-web-resources.jsonl gh release upload ${{github.event.release.tag_name}} ${{ matrix.ckan_instance.name }}.jsonl
env: env:
GITHUB_TOKEN: ${{ github.TOKEN }} GITHUB_TOKEN: ${{ github.TOKEN }}
NMWDC_API_BULK_LOADER_TOKEN: ${{ secrets.NMWDC_API_BULK_LOADER_TOKEN }} NAMESPACE: ${{ matrix.ckan_instance.name }}
INSTANCE_URL: ${{ matrix.ckan_instance.url }}
API_TOKEN: ${{ secrets[matrix.ckan_instance.token] }}

View file

@ -2,8 +2,12 @@ FROM rust:1.96 AS builder
WORKDIR /app WORKDIR /app
RUN rustup set profile minimal RUN rustup set profile minimal
COPY . . COPY . .
RUN cargo build --release ARG NAMESPACE
ENV NAMESPACE=$NAMESPACE
RUN cargo build -p bulk_loader --release
FROM ubuntu:latest FROM ubuntu:latest
RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates
RUN update-ca-certificates
COPY --from=builder /app/target/release/bulk_loader / COPY --from=builder /app/target/release/bulk_loader /
ENTRYPOINT ["/bulk_loader"] ENTRYPOINT ["/bulk_loader"]

View file

@ -2,9 +2,11 @@ use anyhow::Result;
#[tokio::main] #[tokio::main]
async fn main() -> Result<()> { async fn main() -> Result<()> {
// Get the CKAN instance's Geoconnex namespace to filter for its JSON-LD data
let namespace = std::env!("NAMESPACE");
// Get latest release data which is organized as a single JSONL file // Get latest release data which is organized as a single JSONL file
// at https://github.com/dathere/ckan_geoconnex_bulk_runner/releases/latest // at https://github.com/dathere/ckan_geoconnex_bulk_runner/releases/latest
let body = reqwest::get("https://github.com/dathere/ckan_geoconnex_bulk_runner/releases/latest/download/ckan-geoconnex-web-resources.jsonl") let body = reqwest::get(format!("https://github.com/dathere/ckan_geoconnex_bulk_runner/releases/latest/download/{namespace}.jsonl"))
.await? .await?
.text() .text()
.await?; .await?;

View file

@ -6,25 +6,25 @@ use pyo3::prelude::*;
mod ckan_geoconnex_bulk_runner_py { mod ckan_geoconnex_bulk_runner_py {
use pyo3::{exceptions::PyException, prelude::*}; use pyo3::{exceptions::PyException, prelude::*};
#[pyfunction] // #[pyfunction]
/// Construct Geoconnex-compatible JSON-LD as a string from dataset metadata. // Construct Geoconnex-compatible JSON-LD as a string from dataset metadata.
/// //
/// Input: Dataset metadata (output of /package_show for a CKAN dataset) as a string. // Input: Dataset metadata (output of /package_show for a CKAN dataset) as a string.
/// Output: Constructed Geoconnex-compatible JSON-LD as a string. // Output: Constructed Geoconnex-compatible JSON-LD as a string.
fn construct_dataset_jsonld_from_metadata(dataset_metadata: String) -> PyResult<String> { // fn construct_dataset_jsonld_from_metadata(dataset_metadata: String) -> PyResult<String> {
match serde_json::to_value(dataset_metadata) { // match serde_json::to_value(dataset_metadata) {
Ok(dataset_json) => { // Ok(dataset_json) => {
match geoconnex_utils::jsonld::construct_dataset_jsonld_from_metadata(dataset_json) // match geoconnex_utils::jsonld::construct_dataset_jsonld_from_metadata(dataset_json)
{ // {
Ok(jsonld) => serde_json::to_string(&jsonld).map_err(|e| { // Ok(jsonld) => serde_json::to_string(&jsonld).map_err(|e| {
PyException::new_err(format!( // PyException::new_err(format!(
"Error when converting JSON-LD to string: {e}" // "Error when converting JSON-LD to string: {e}"
)) // ))
}), // }),
Err(e) => Err(PyException::new_err(e.to_string())), // Err(e) => Err(PyException::new_err(e.to_string())),
} // }
} // }
Err(e) => Err(PyException::new_err(e.to_string())), // Err(e) => Err(PyException::new_err(e.to_string())),
} // }
} // }
} }

View file

@ -4,15 +4,14 @@ use std::collections::HashMap;
#[tokio::main] #[tokio::main]
async fn main() -> Result<()> { async fn main() -> Result<()> {
// Identify required header data let namespace = env!("NAMESPACE");
let Ok(nmwdc_token) = std::env::var("NMWDC_API_BULK_LOADER_TOKEN") else { let token = env!("API_TOKEN");
bail!("Could not find environment variable NMWDC_API_BULK_LOADER_TOKEN."); let instance_url = env!("INSTANCE_URL");
};
let mut headers = HashMap::new(); let mut headers = HashMap::new();
headers.insert("x-geoconnex-runner".to_string(), nmwdc_token); headers.insert("x-geoconnex-runner".to_string(), token.to_string());
let ckan = ckanaction::CKAN::builder() let ckan = ckanaction::CKAN::builder()
.url("https://catalog.newmexicowaterdata.org") .url(instance_url)
.headers(headers) .headers(headers)
.build(); .build();
@ -68,6 +67,8 @@ async fn main() -> Result<()> {
// 2. Construct JSON-LD based on the data from /package_show // 2. Construct JSON-LD based on the data from /package_show
let jsonld = match construct_dataset_jsonld_from_metadata( let jsonld = match construct_dataset_jsonld_from_metadata(
dataset_metadata.to_owned(), dataset_metadata.to_owned(),
instance_url.to_string(),
namespace.to_string(),
) { ) {
Ok(j) => j, Ok(j) => j,
Err(e) => { Err(e) => {

View file

@ -3,6 +3,8 @@ use serde_json::json;
pub fn construct_dataset_jsonld_from_metadata( pub fn construct_dataset_jsonld_from_metadata(
dataset_metadata: serde_json::Value, dataset_metadata: serde_json::Value,
instance_url: String,
namespace: String,
) -> Result<serde_json::Value> { ) -> Result<serde_json::Value> {
let dataset_id = dataset_metadata.get("id").unwrap().as_str().unwrap(); let dataset_id = dataset_metadata.get("id").unwrap().as_str().unwrap();
eprintln!("Attempting to construct JSON-LD for dataset {dataset_id}"); eprintln!("Attempting to construct JSON-LD for dataset {dataset_id}");
@ -60,14 +62,14 @@ pub fn construct_dataset_jsonld_from_metadata(
}, },
"@type": "Dataset", "@type": "Dataset",
// TODO: Customize namespace based on CKAN instance being used // TODO: Customize namespace based on CKAN instance being used
"@id": format!("https://geoconnex.us/ckan/sandbox/{dataset_id}"), "@id": format!("https://geoconnex.us/ckan/{namespace}/{dataset_id}"),
"name": dataset_title, "name": dataset_title,
"provider": { "provider": {
"@type": "Organization", "@type": "Organization",
"name": organization_name "name": organization_name
}, },
// TODO: Customize CKAN instance URL based on CKAN instance being used // TODO: Customize CKAN instance URL based on CKAN instance being used
"url": format!("https://sandbox.opendataportal.us/dataset/{dataset_id}") "url": format!("{instance_url}/dataset/{dataset_id}")
}); });
let jsonld_map = jsonld.as_object_mut().unwrap(); let jsonld_map = jsonld.as_object_mut().unwrap();
if about.len() > 0 { if about.len() > 0 {