From d5492cb2ad3376bfc8f6d0aac50bc80598948711 Mon Sep 17 00:00:00 2001
From: rzmk <30333942+rzmk@users.noreply.github.com>
Date: Fri, 8 May 2026 16:20:06 -0400
Subject: [PATCH] feat: initial JSON-LD construction logic, add local test,
 improve schema

---
Review notes (text in this section is ignored by `git am`):
- src/main.rs: /package_show error messages now print the /package_show
  response instead of the outer /package_list `response`.
- src/main.rs: removed the stray quote in the {"success": false} messages.
- src/main.rs: a non-boolean "success" value is treated as a failure
  instead of panicking.
- src/schema.rs: "gsp:hasGeometry" uses the JSON Schema keyword "type"
  ("@type" is not a schema keyword and was silently ignored).
- src/jsonld.rs: a null/missing organization no longer panics; the
  provider name becomes null, so the dataset fails schema validation and
  is reported on stderr by main.rs.
- tests/validate_jsonld.rs: JSONL line numbers are reported 1-based.
- README.md: grammar fix.
All hunks keep their original line counts.

 .gitignore               |  2 ++
 Cargo.toml               |  3 +++
 README.md                |  6 ++++++
 src/jsonld.rs            | 41 ++++++++++++++++++++++++++++----------
 src/main.rs              | 43 +++++++++++++++++++++++++++++++---------
 src/schema.rs            | 41 +++++++++++++++++++++++++++++++++-----
 tests/validate_jsonld.rs | 42 +++++++++++++++++++++++++++++++++++++--
 7 files changed, 151 insertions(+), 27 deletions(-)

diff --git a/.gitignore b/.gitignore
index ea8c4bf..65bb74b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,3 @@
 /target
+# For local tests
+/tests/*.jsonl
diff --git a/Cargo.toml b/Cargo.toml
index 9b08f04..1221147 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,3 +9,6 @@ ckanaction = "0.2.0"
 jsonschema = "0.46.4"
 serde_json = "1.0.149"
 tokio = { version = "1.52.1", features = ["full"] }
+
+[features]
+local = []
diff --git a/README.md b/README.md
index 1920b8b..70a940a 100644
--- a/README.md
+++ b/README.md
@@ -25,3 +25,9 @@ To include print statements in test output, run:
 ```bash
 cargo test -- --nocapture
 ```
+
+If you have the local dump files set up, you can run those tests with:
+
+```bash
+cargo test -F local -- --nocapture
+```
diff --git a/src/jsonld.rs b/src/jsonld.rs
index ed29457..2efa904 100644
--- a/src/jsonld.rs
+++ b/src/jsonld.rs
@@ -1,13 +1,32 @@
-use crate::schema::get_dataset_schema;
+use serde_json::json;
 
-pub fn construct_dataset_jsonld_from_metadata(metadata: serde_json::Value) -> serde_json::Value {
-    todo!()
-}
-
-pub fn validate_dataset_jsonld(jsonld: serde_json::Value) -> bool {
-    if let Ok(_) = jsonschema::validate(&get_dataset_schema(), &jsonld) {
-        true
-    } else {
-        false
-    }
+pub fn construct_dataset_jsonld_from_metadata(
+    dataset_metadata: serde_json::Value,
+) -> serde_json::Value {
+    let dataset_id = dataset_metadata.get("id").unwrap().as_str().unwrap();
+    let dataset_name = dataset_metadata.get("name").unwrap().as_str().unwrap();
+    let organization_name = dataset_metadata
+        .get("organization")
+        .and_then(|organization| organization.get("title"))
+        .cloned()
+        .unwrap_or_default();
+    // TODO: Align and include Geoconnex PIDs for reference feature categories to extract PIDs from them
+    // Then also convert spatial_full FeatureCollection to Multipolygon if needed for gsp:hasGeometry when there are
+    // also non-reference feature polygons
+    // if let Some(spatial_full) = dataset_metadata.get("spatial_full") {}
+    let jsonld = json!({
+        "@context": {
+            "@vocab": "https://schema.org/",
+            "gsp": "http://www.opengis.net/ont/geosparql#",
+        },
+        "@type": "Dataset",
+        // TODO: Customize namespace based on CKAN instance being used
+        "@id": format!("https://geoconnex.us/nmwdh/ckan-datasets/{dataset_id}"),
+        "name": dataset_name,
+        "provider": {
+            "@type": "Organization",
+            "name": organization_name
+        }
+    });
+    jsonld
 }
diff --git a/src/main.rs b/src/main.rs
index 7a25204..686a6fc 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,5 +1,7 @@
 use anyhow::{Result, bail};
-use ckan_geoconnex_bulk_runner::jsonld::construct_dataset_jsonld_from_metadata;
+use ckan_geoconnex_bulk_runner::{
+    jsonld::construct_dataset_jsonld_from_metadata, schema::get_dataset_schema,
+};
 
 // TODO: Ensure error output is only streamed to stderr as per Geoconnex docs
 
@@ -42,21 +44,44 @@ async fn main() -> Result<()> {
                 for dataset_name in dataset_names {
                     // 1. Get the dataset's metadata with /package_show by using the dataset name as the id
                     // TODO: Identify if dataset names are unique
-                    let dataset_metadata = ckan
+                    let package_show_response = ckan
                         .package_show()
                         .id(dataset_name.as_str().unwrap().to_string())
                         .call()
                         .await?;
-                    println!("{dataset_metadata:#?}");
-                    // 2. Construct JSON-LD based on the data from /package_show
-                    let jsonld = construct_dataset_jsonld_from_metadata(dataset_metadata);
-                    println!("{jsonld:#?}");
-                    // 3. Validate the JSON-LD against the dataset JSON schema
-                    // 4. Print the JSON-LD on a new line to stdout
+                    let Some(success) = package_show_response.get("success") else {
+                        bail!(
+                            "CKAN API did not return success key in /package_show response for dataset {dataset_name}. Full response: {package_show_response}"
+                        );
+                    };
+                    if success.as_bool().unwrap_or(false) {
+                        let Some(dataset_metadata) = package_show_response.get("result") else {
+                            bail!(
+                                "CKAN API did not return result object in /package_show response for dataset {dataset_name}. Full response: {package_show_response}"
+                            );
+                        };
+                        // 2. Construct JSON-LD based on the data from /package_show
+                        let jsonld =
+                            construct_dataset_jsonld_from_metadata(dataset_metadata.to_owned());
+                        // 3. Validate the JSON-LD against the dataset JSON schema
+                        if jsonschema::validate(&get_dataset_schema(), &jsonld).is_ok() {
+                            // 4. Print the JSON-LD on a new line to stdout
+                            println!("{jsonld}");
+                        } else {
+                            eprintln!("JSON-LD for {dataset_name} is not valid.");
+                            eprintln!("{jsonld}");
+                        }
+                    } else {
+                        bail!(
+                            "CKAN API returned {{\"success\": false}} for /package_show endpoint on dataset {dataset_name}. Full response: {package_show_response}"
+                        );
+                    }
                 }
             }
         } else {
-            bail!("CKAN API returned {{\"success\": false\"}}. Full response: {response}");
+            bail!(
+                "CKAN API returned {{\"success\": false}} for /package_list endpoint. Full response: {response}"
+            );
         }
         offset = offset + limit;
     }
diff --git a/src/schema.rs b/src/schema.rs
index ccec0e6..a4267f0 100644
--- a/src/schema.rs
+++ b/src/schema.rs
@@ -1,17 +1,25 @@
 use serde_json::json;
 
 pub fn get_dataset_schema() -> serde_json::Value {
-    json!({
+    // Allow for "local" feature
+    #[allow(unused_mut)]
+    let mut dataset_schema = json!({
         "type": "object",
         "properties": {
             "@context": {"type": ["string", "object"]},
-            "@type": {"type": ["string", "array"], "contains": {"const": "Dataset"}},
+            "@type": {"type": ["string", "array"], "contains": {"anyOf": [{"const": "Dataset"}, {"const": "schema:Dataset"}]}},
             "@id": {"type": "string"},
             "name": {"type": "string"},
+            "schema:name": {"type": "string"},
             "provider": {
                 "type": "object",
                 "properties": {"@type": {"type": "string"}, "name": {"type": "string"}},
             },
+            "schema:provider": {
+                "type": "object",
+                "properties": {"@type": {"type": "string"}, "name": {"type": "string"}},
+            },
+            "gsp:hasGeometry": {"type": "object"},
             "about": {
                 "type": ["string", "array"],
                 "items": {
@@ -21,8 +29,28 @@ pub fn get_dataset_schema() -> serde_json::Value {
                 "minItems": 1
             },
         },
-        "required": ["@context", "@type", "@id", "name", "provider", "about"]
-    })
+        "anyOf": [
+            { "required": ["@context", "@type", "@id", "name", "provider", "about"] },
+            { "required": ["@context", "@type", "@id", "name", "provider", "gsp:hasGeometry"] },
+            { "required": ["@context", "@type", "@id", "schema:name", "schema:provider", "about"] },
+            { "required": ["@context", "@type", "@id", "schema:name", "schema:provider", "gsp:hasGeometry"] },
+            // { "required": ["@context", "@type", "@id", "name", "provider"] }
+        ]
+    });
+    // Some JSON-LD for datasets (e.g. sciencebase) do not have about or gsp:hasGeometry yet are still valid as per SHACL shape
+    #[cfg(feature = "local")]
+    {
+        let required_array = dataset_schema
+            .get_mut("anyOf")
+            .unwrap()
+            .as_array_mut()
+            .unwrap();
+        required_array.insert(
+            required_array.len(),
+            json!({ "required": ["@context", "@type", "@id", "schema:name", "schema:provider"] }),
+        );
+    }
+    dataset_schema
 }
 
 pub fn get_location_schema() -> serde_json::Value {
@@ -40,6 +68,9 @@ pub fn get_location_schema() -> serde_json::Value {
             "geo": {"type": "object"},
             "gsp:hasGeometry": {"type": "object"}
         },
-        "required": ["@context", "@type", "@id", "name", "provider", "geo", "gsp:hasGeometry"]
+        "anyOf": [
+            { "required": ["@context", "@type", "@id", "name", "provider", "geo", "gsp:hasGeometry"] },
+            { "required": ["@context", "@type", "@id", "schema:name", "schema:provider", "geo", "gsp:hasGeometry"] },
+        ]
     })
 }
diff --git a/tests/validate_jsonld.rs b/tests/validate_jsonld.rs
index 596af2d..1a80672 100644
--- a/tests/validate_jsonld.rs
+++ b/tests/validate_jsonld.rs
@@ -1,6 +1,44 @@
 use anyhow::{Result, bail};
+#[cfg(feature = "local")]
+use ckan_geoconnex_bulk_runner::schema::get_dataset_schema;
 use ckan_geoconnex_bulk_runner::schema::get_location_schema;
 use serde_json::json;
+#[cfg(feature = "local")]
+use std::{
+    fs::File,
+    io::{BufRead, BufReader},
+};
+
+#[test]
+#[cfg(feature = "local")]
+fn validate_sciencebase_dump() -> Result<()> {
+    let file_path = "./tests/sciencebase_jsonld_dump_202605-06.jsonl";
+    if !std::fs::exists(file_path)? {
+        bail!("File path {file_path} does not exist.")
+    }
+
+    let dataset_json_schema = get_dataset_schema();
+
+    // Read JSONL file line-by-line
+    let file = File::open(file_path)?;
+    let reader = BufReader::new(file);
+
+    let mut line_number = 1;
+    for line in reader.lines() {
+        let jsonld: serde_json::Value = serde_json::from_str(line?.as_str())?;
+        if let Err(e) = jsonschema::validate(&dataset_json_schema, &jsonld) {
+            println!("Error during validation on line {line_number}:");
+            println!("JSON-LD:");
+            println!("{jsonld:#?}");
+            bail!("{e}");
+        } else {
+            println!("Successfully validated line {line_number}.");
+            line_number = line_number + 1;
+        }
+    }
+
+    Ok(())
+}
 
 #[test]
 fn validate_usgs_location_jsonld() -> Result<()> {
@@ -46,9 +84,9 @@ fn validate_usgs_location_jsonld() -> Result<()> {
         }
     });
 
-    let dataset_json_schema = get_location_schema();
+    let location_json_schema = get_location_schema();
 
-    if let Err(e) = jsonschema::validate(&dataset_json_schema, &usgs_location_jsonld) {
+    if let Err(e) = jsonschema::validate(&location_json_schema, &usgs_location_jsonld) {
         println!("Error during validation:");
         bail!("{e}");
     } else {