feat: initial JSON-LD construction logic, add local test, improve schema

This commit is contained in:
rzmk 2026-05-08 16:20:06 -04:00
parent 3497994681
commit d5492cb2ad
7 changed files with 151 additions and 27 deletions

2
.gitignore vendored
View file

@ -1 +1,3 @@
/target /target
# For local tests
/tests/*.jsonl

View file

@ -9,3 +9,6 @@ ckanaction = "0.2.0"
jsonschema = "0.46.4" jsonschema = "0.46.4"
serde_json = "1.0.149" serde_json = "1.0.149"
tokio = { version = "1.52.1", features = ["full"] } tokio = { version = "1.52.1", features = ["full"] }
[features]
local = []

View file

@ -25,3 +25,9 @@ To include print statements in test output, run:
```bash ```bash
cargo test -- --nocapture cargo test -- --nocapture
``` ```
If you have the local dump files set up, you can run the local tests with:
```bash
cargo test -F local -- --nocapture
```

View file

@ -1,13 +1,32 @@
use crate::schema::get_dataset_schema; use serde_json::json;
pub fn construct_dataset_jsonld_from_metadata(metadata: serde_json::Value) -> serde_json::Value { pub fn construct_dataset_jsonld_from_metadata(
todo!() dataset_metadata: serde_json::Value,
} ) -> serde_json::Value {
let dataset_id = dataset_metadata.get("id").unwrap().as_str().unwrap();
pub fn validate_dataset_jsonld(jsonld: serde_json::Value) -> bool { let dataset_name = dataset_metadata.get("name").unwrap().as_str().unwrap();
if let Ok(_) = jsonschema::validate(&get_dataset_schema(), &jsonld) { let organization_name = dataset_metadata
true .get("organization")
} else { .unwrap()
false .get("title")
.unwrap();
// TODO: Align and include Geoconnex PIDs for reference feature categories to extract PIDs from them
// Then also convert spatial_full FeatureCollection to Multipolygon if needed for gsp:hasGeometry when there are
// also non-reference feature polygons
// if let Some(spatial_full) = dataset_metadata.get("spatial_full") {}
let jsonld = json!({
"@context": {
"@vocab": "https://schema.org/",
"gsp": "http://www.opengis.net/ont/geosparql#",
},
"@type": "Dataset",
// TODO: Customize namespace based on CKAN instance being used
"@id": format!("https://geoconnex.us/nmwdh/ckan-datasets/{dataset_id}"),
"name": dataset_name,
"provider": {
"@type": "Organization",
"name": organization_name
} }
});
jsonld
} }

View file

@ -1,5 +1,7 @@
use anyhow::{Result, bail}; use anyhow::{Result, bail};
use ckan_geoconnex_bulk_runner::jsonld::construct_dataset_jsonld_from_metadata; use ckan_geoconnex_bulk_runner::{
jsonld::construct_dataset_jsonld_from_metadata, schema::get_dataset_schema,
};
// TODO: Ensure error output is only streamed to stderr as per Geoconnex docs // TODO: Ensure error output is only streamed to stderr as per Geoconnex docs
@ -42,21 +44,44 @@ async fn main() -> Result<()> {
for dataset_name in dataset_names { for dataset_name in dataset_names {
// 1. Get the dataset's metadata with /package_show by using the dataset name as the id // 1. Get the dataset's metadata with /package_show by using the dataset name as the id
// TODO: Identify if dataset names are unique // TODO: Identify if dataset names are unique
let dataset_metadata = ckan let package_show_response = ckan
.package_show() .package_show()
.id(dataset_name.as_str().unwrap().to_string()) .id(dataset_name.as_str().unwrap().to_string())
.call() .call()
.await?; .await?;
println!("{dataset_metadata:#?}"); let Some(success) = package_show_response.get("success") else {
bail!(
"CKAN API did not return success key in /package_show response for dataset {dataset_name}. Full response: {response}"
);
};
if success.as_bool().unwrap() {
let Some(dataset_metadata) = package_show_response.get("result") else {
bail!(
"CKAN API did not return result object in /package_show response for dataset {dataset_name}. Full response: {response}"
);
};
// 2. Construct JSON-LD based on the data from /package_show // 2. Construct JSON-LD based on the data from /package_show
let jsonld = construct_dataset_jsonld_from_metadata(dataset_metadata); let jsonld =
println!("{jsonld:#?}"); construct_dataset_jsonld_from_metadata(dataset_metadata.to_owned());
// 3. Validate the JSON-LD against the dataset JSON schema // 3. Validate the JSON-LD against the dataset JSON schema
if jsonschema::validate(&get_dataset_schema(), &jsonld).is_ok() {
// 4. Print the JSON-LD on a new line to stdout // 4. Print the JSON-LD on a new line to stdout
println!("{jsonld}");
} else {
eprintln!("JSON-LD for {dataset_name} is not valid.");
eprintln!("{jsonld}");
}
} else {
bail!(
"CKAN API returned {{\"success\": false\"}} for /package_show endpoint on dataset {dataset_name}. Full response: {response}"
);
}
} }
} }
} else { } else {
bail!("CKAN API returned {{\"success\": false\"}}. Full response: {response}"); bail!(
"CKAN API returned {{\"success\": false\"}} for /package_list endpoint. Full response: {response}"
);
} }
offset = offset + limit; offset = offset + limit;
} }

View file

@ -1,17 +1,25 @@
use serde_json::json; use serde_json::json;
pub fn get_dataset_schema() -> serde_json::Value { pub fn get_dataset_schema() -> serde_json::Value {
json!({ // Allow for "local" feature
#[allow(unused_mut)]
let mut dataset_schema = json!({
"type": "object", "type": "object",
"properties": { "properties": {
"@context": {"type": ["string", "object"]}, "@context": {"type": ["string", "object"]},
"@type": {"type": ["string", "array"], "contains": {"const": "Dataset"}}, "@type": {"type": ["string", "array"], "contains": {"anyOf": [{"const": "Dataset"}, {"const": "schema:Dataset"}]}},
"@id": {"type": "string"}, "@id": {"type": "string"},
"name": {"type": "string"}, "name": {"type": "string"},
"schema:name": {"type": "string"},
"provider": { "provider": {
"type": "object", "type": "object",
"properties": {"@type": {"type": "string"}, "name": {"type": "string"}}, "properties": {"@type": {"type": "string"}, "name": {"type": "string"}},
}, },
"schema:provider": {
"type": "object",
"properties": {"@type": {"type": "string"}, "name": {"type": "string"}},
},
"gsp:hasGeometry": {"@type": "object"},
"about": { "about": {
"type": ["string", "array"], "type": ["string", "array"],
"items": { "items": {
@ -21,8 +29,28 @@ pub fn get_dataset_schema() -> serde_json::Value {
"minItems": 1 "minItems": 1
}, },
}, },
"required": ["@context", "@type", "@id", "name", "provider", "about"] "anyOf": [
}) { "required": ["@context", "@type", "@id", "name", "provider", "about"] },
{ "required": ["@context", "@type", "@id", "name", "provider", "gsp:hasGeometry"] },
{ "required": ["@context", "@type", "@id", "schema:name", "schema:provider", "about"] },
{ "required": ["@context", "@type", "@id", "schema:name", "schema:provider", "gsp:hasGeometry"] },
// { "required": ["@context", "@type", "@id", "name", "provider"] }
]
});
// Some JSON-LD for datasets (e.g. sciencebase) do not have about or gsp:hasGeometry yet are still valid as per SHACL shape
#[cfg(feature = "local")]
{
let required_array = dataset_schema
.get_mut("anyOf")
.unwrap()
.as_array_mut()
.unwrap();
required_array.insert(
required_array.len(),
json!({ "required": ["@context", "@type", "@id", "schema:name", "schema:provider"] }),
);
}
dataset_schema
} }
pub fn get_location_schema() -> serde_json::Value { pub fn get_location_schema() -> serde_json::Value {
@ -40,6 +68,9 @@ pub fn get_location_schema() -> serde_json::Value {
"geo": {"type": "object"}, "geo": {"type": "object"},
"gsp:hasGeometry": {"type": "object"} "gsp:hasGeometry": {"type": "object"}
}, },
"required": ["@context", "@type", "@id", "name", "provider", "geo", "gsp:hasGeometry"] "anyOf": [
{ "required": ["@context", "@type", "@id", "name", "provider", "geo", "gsp:hasGeometry"] },
{ "required": ["@context", "@type", "@id", "schema:name", "schema:provider", "geo", "gsp:hasGeometry"] },
]
}) })
} }

View file

@ -1,6 +1,44 @@
use anyhow::{Result, bail}; use anyhow::{Result, bail};
#[cfg(feature = "local")]
use ckan_geoconnex_bulk_runner::schema::get_dataset_schema;
use ckan_geoconnex_bulk_runner::schema::get_location_schema; use ckan_geoconnex_bulk_runner::schema::get_location_schema;
use serde_json::json; use serde_json::json;
#[cfg(feature = "local")]
use std::{
fs::File,
io::{BufRead, BufReader},
};
/// Validates each line of the local ScienceBase JSON-LD dump against the dataset JSON schema.
///
/// Only compiled when the `local` feature is enabled. The dump is read as JSONL (one JSON
/// document per line). The test stops at the first line that fails to read, parse, or
/// validate, and reports progress using a zero-based line counter.
///
/// # Errors
///
/// Returns an error if the dump file does not exist or cannot be read, if a line is not
/// valid JSON, or if a document fails schema validation.
#[test]
#[cfg(feature = "local")]
fn validate_sciencebase_dump() -> Result<()> {
    let file_path = "./tests/sciencebase_jsonld_dump_202605-06.jsonl";
    // Fail with an explicit message rather than a bare I/O error when the dump is absent.
    if !std::fs::exists(file_path)? {
        bail!("File path {file_path} does not exist.")
    }

    let schema = get_dataset_schema();

    // Stream the JSONL file one record at a time; the counter starts at zero.
    let records = BufReader::new(File::open(file_path)?).lines();
    for (line_number, record) in records.enumerate() {
        let jsonld: serde_json::Value = serde_json::from_str(&record?)?;
        match jsonschema::validate(&schema, &jsonld) {
            Ok(_) => println!("Successfully validated line {line_number}."),
            Err(e) => {
                println!("Error during validation on line {line_number}:");
                println!("JSON-LD:");
                println!("{jsonld:#?}");
                bail!("{e}");
            }
        }
    }

    Ok(())
}
#[test] #[test]
fn validate_usgs_location_jsonld() -> Result<()> { fn validate_usgs_location_jsonld() -> Result<()> {
@ -46,9 +84,9 @@ fn validate_usgs_location_jsonld() -> Result<()> {
} }
}); });
let dataset_json_schema = get_location_schema(); let location_json_schema = get_location_schema();
if let Err(e) = jsonschema::validate(&dataset_json_schema, &usgs_location_jsonld) { if let Err(e) = jsonschema::validate(&location_json_schema, &usgs_location_jsonld) {
println!("Error during validation:"); println!("Error during validation:");
bail!("{e}"); bail!("{e}");
} else { } else {