mirror of
https://github.com/dathere/ckan_geoconnex_bulk_runner.git
synced 2026-07-05 15:12:20 +00:00
feat: initial JSON-LD construction logic, add local test, improve schema
This commit is contained in:
parent
3497994681
commit
d5492cb2ad
7 changed files with 151 additions and 27 deletions
2
.gitignore
vendored
2
.gitignore
vendored
|
|
@ -1 +1,3 @@
|
|||
/target
|
||||
# For local tests
|
||||
/tests/*.jsonl
|
||||
|
|
|
|||
|
|
@ -9,3 +9,6 @@ ckanaction = "0.2.0"
|
|||
jsonschema = "0.46.4"
|
||||
serde_json = "1.0.149"
|
||||
tokio = { version = "1.52.1", features = ["full"] }
|
||||
|
||||
[features]
|
||||
local = []
|
||||
|
|
|
|||
|
|
@ -25,3 +25,9 @@ To include print statements in test output, run:
|
|||
```bash
|
||||
cargo test -- --nocapture
|
||||
```
|
||||
|
||||
If you have the local dump files set up, you can run those tests with:
|
||||
|
||||
```bash
|
||||
cargo test -F local -- --nocapture
|
||||
```
|
||||
|
|
|
|||
|
|
@ -1,13 +1,32 @@
|
|||
use crate::schema::get_dataset_schema;
|
||||
use serde_json::json;
|
||||
|
||||
pub fn construct_dataset_jsonld_from_metadata(metadata: serde_json::Value) -> serde_json::Value {
|
||||
todo!()
|
||||
}
|
||||
|
||||
pub fn validate_dataset_jsonld(jsonld: serde_json::Value) -> bool {
|
||||
if let Ok(_) = jsonschema::validate(&get_dataset_schema(), &jsonld) {
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
pub fn construct_dataset_jsonld_from_metadata(
|
||||
dataset_metadata: serde_json::Value,
|
||||
) -> serde_json::Value {
|
||||
let dataset_id = dataset_metadata.get("id").unwrap().as_str().unwrap();
|
||||
let dataset_name = dataset_metadata.get("name").unwrap().as_str().unwrap();
|
||||
let organization_name = dataset_metadata
|
||||
.get("organization")
|
||||
.unwrap()
|
||||
.get("title")
|
||||
.unwrap();
|
||||
// TODO: Align and include Geoconnex PIDs for reference feature categories to extract PIDs from them
|
||||
// Then also convert spatial_full FeatureCollection to Multipolygon if needed for gsp:hasGeometry when there are
|
||||
// also non-reference feature polygons
|
||||
// if let Some(spatial_full) = dataset_metadata.get("spatial_full") {}
|
||||
let jsonld = json!({
|
||||
"@context": {
|
||||
"@vocab": "https://schema.org/",
|
||||
"gsp": "http://www.opengis.net/ont/geosparql#",
|
||||
},
|
||||
"@type": "Dataset",
|
||||
// TODO: Customize namespace based on CKAN instance being used
|
||||
"@id": format!("https://geoconnex.us/nmwdh/ckan-datasets/{dataset_id}"),
|
||||
"name": dataset_name,
|
||||
"provider": {
|
||||
"@type": "Organization",
|
||||
"name": organization_name
|
||||
}
|
||||
});
|
||||
jsonld
|
||||
}
|
||||
|
|
|
|||
43
src/main.rs
43
src/main.rs
|
|
@ -1,5 +1,7 @@
|
|||
use anyhow::{Result, bail};
|
||||
use ckan_geoconnex_bulk_runner::jsonld::construct_dataset_jsonld_from_metadata;
|
||||
use ckan_geoconnex_bulk_runner::{
|
||||
jsonld::construct_dataset_jsonld_from_metadata, schema::get_dataset_schema,
|
||||
};
|
||||
|
||||
// TODO: Ensure error output is only streamed to stderr as per Geoconnex docs
|
||||
|
||||
|
|
@ -42,21 +44,44 @@ async fn main() -> Result<()> {
|
|||
for dataset_name in dataset_names {
|
||||
// 1. Get the dataset's metadata with /package_show by using the dataset name as the id
|
||||
// TODO: Identify if dataset names are unique
|
||||
let dataset_metadata = ckan
|
||||
let package_show_response = ckan
|
||||
.package_show()
|
||||
.id(dataset_name.as_str().unwrap().to_string())
|
||||
.call()
|
||||
.await?;
|
||||
println!("{dataset_metadata:#?}");
|
||||
// 2. Construct JSON-LD based on the data from /package_show
|
||||
let jsonld = construct_dataset_jsonld_from_metadata(dataset_metadata);
|
||||
println!("{jsonld:#?}");
|
||||
// 3. Validate the JSON-LD against the dataset JSON schema
|
||||
// 4. Print the JSON-LD on a new line to stdout
|
||||
let Some(success) = package_show_response.get("success") else {
|
||||
bail!(
|
||||
"CKAN API did not return success key in /package_show response for dataset {dataset_name}. Full response: {response}"
|
||||
);
|
||||
};
|
||||
if success.as_bool().unwrap() {
|
||||
let Some(dataset_metadata) = package_show_response.get("result") else {
|
||||
bail!(
|
||||
"CKAN API did not return result object in /package_show response for dataset {dataset_name}. Full response: {response}"
|
||||
);
|
||||
};
|
||||
// 2. Construct JSON-LD based on the data from /package_show
|
||||
let jsonld =
|
||||
construct_dataset_jsonld_from_metadata(dataset_metadata.to_owned());
|
||||
// 3. Validate the JSON-LD against the dataset JSON schema
|
||||
if jsonschema::validate(&get_dataset_schema(), &jsonld).is_ok() {
|
||||
// 4. Print the JSON-LD on a new line to stdout
|
||||
println!("{jsonld}");
|
||||
} else {
|
||||
eprintln!("JSON-LD for {dataset_name} is not valid.");
|
||||
eprintln!("{jsonld}");
|
||||
}
|
||||
} else {
|
||||
bail!(
|
||||
"CKAN API returned {{\"success\": false\"}} for /package_show endpoint on dataset {dataset_name}. Full response: {response}"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
bail!("CKAN API returned {{\"success\": false\"}}. Full response: {response}");
|
||||
bail!(
|
||||
"CKAN API returned {{\"success\": false\"}} for /package_list endpoint. Full response: {response}"
|
||||
);
|
||||
}
|
||||
offset = offset + limit;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,17 +1,25 @@
|
|||
use serde_json::json;
|
||||
|
||||
pub fn get_dataset_schema() -> serde_json::Value {
|
||||
json!({
|
||||
// Allow for "local" feature
|
||||
#[allow(unused_mut)]
|
||||
let mut dataset_schema = json!({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"@context": {"type": ["string", "object"]},
|
||||
"@type": {"type": ["string", "array"], "contains": {"const": "Dataset"}},
|
||||
"@type": {"type": ["string", "array"], "contains": {"anyOf": [{"const": "Dataset"}, {"const": "schema:Dataset"}]}},
|
||||
"@id": {"type": "string"},
|
||||
"name": {"type": "string"},
|
||||
"schema:name": {"type": "string"},
|
||||
"provider": {
|
||||
"type": "object",
|
||||
"properties": {"@type": {"type": "string"}, "name": {"type": "string"}},
|
||||
},
|
||||
"schema:provider": {
|
||||
"type": "object",
|
||||
"properties": {"@type": {"type": "string"}, "name": {"type": "string"}},
|
||||
},
|
||||
"gsp:hasGeometry": {"@type": "object"},
|
||||
"about": {
|
||||
"type": ["string", "array"],
|
||||
"items": {
|
||||
|
|
@ -21,8 +29,28 @@ pub fn get_dataset_schema() -> serde_json::Value {
|
|||
"minItems": 1
|
||||
},
|
||||
},
|
||||
"required": ["@context", "@type", "@id", "name", "provider", "about"]
|
||||
})
|
||||
"anyOf": [
|
||||
{ "required": ["@context", "@type", "@id", "name", "provider", "about"] },
|
||||
{ "required": ["@context", "@type", "@id", "name", "provider", "gsp:hasGeometry"] },
|
||||
{ "required": ["@context", "@type", "@id", "schema:name", "schema:provider", "about"] },
|
||||
{ "required": ["@context", "@type", "@id", "schema:name", "schema:provider", "gsp:hasGeometry"] },
|
||||
// { "required": ["@context", "@type", "@id", "name", "provider"] }
|
||||
]
|
||||
});
|
||||
// Some JSON-LD for datasets (e.g. sciencebase) do not have about or gsp:hasGeometry yet are still valid as per SHACL shape
|
||||
#[cfg(feature = "local")]
|
||||
{
|
||||
let required_array = dataset_schema
|
||||
.get_mut("anyOf")
|
||||
.unwrap()
|
||||
.as_array_mut()
|
||||
.unwrap();
|
||||
required_array.insert(
|
||||
required_array.len(),
|
||||
json!({ "required": ["@context", "@type", "@id", "schema:name", "schema:provider"] }),
|
||||
);
|
||||
}
|
||||
dataset_schema
|
||||
}
|
||||
|
||||
pub fn get_location_schema() -> serde_json::Value {
|
||||
|
|
@ -40,6 +68,9 @@ pub fn get_location_schema() -> serde_json::Value {
|
|||
"geo": {"type": "object"},
|
||||
"gsp:hasGeometry": {"type": "object"}
|
||||
},
|
||||
"required": ["@context", "@type", "@id", "name", "provider", "geo", "gsp:hasGeometry"]
|
||||
"anyOf": [
|
||||
{ "required": ["@context", "@type", "@id", "name", "provider", "geo", "gsp:hasGeometry"] },
|
||||
{ "required": ["@context", "@type", "@id", "schema:name", "schema:provider", "geo", "gsp:hasGeometry"] },
|
||||
]
|
||||
})
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,44 @@
|
|||
use anyhow::{Result, bail};
|
||||
#[cfg(feature = "local")]
|
||||
use ckan_geoconnex_bulk_runner::schema::get_dataset_schema;
|
||||
use ckan_geoconnex_bulk_runner::schema::get_location_schema;
|
||||
use serde_json::json;
|
||||
#[cfg(feature = "local")]
|
||||
use std::{
|
||||
fs::File,
|
||||
io::{BufRead, BufReader},
|
||||
};
|
||||
|
||||
/// Validates every line of the local sciencebase JSON-LD dump against the dataset JSON schema.
///
/// Only compiled with the `local` feature, since the dump file is not checked in. The test
/// fails if the dump file does not exist, on the first line that cannot be read or parsed as
/// JSON, and on the first document that does not satisfy the schema.
#[test]
#[cfg(feature = "local")]
fn validate_sciencebase_dump() -> Result<()> {
    let file_path = "./tests/sciencebase_jsonld_dump_202605-06.jsonl";
    if !std::fs::exists(file_path)? {
        bail!("File path {file_path} does not exist.")
    }

    let schema = get_dataset_schema();

    // Stream the JSONL dump one document per line; `line_number` is zero-based.
    let lines = BufReader::new(File::open(file_path)?).lines();
    for (line_number, line) in lines.enumerate() {
        let raw = line?;
        let jsonld: serde_json::Value = serde_json::from_str(&raw)?;

        // Stop at the first invalid document and dump it for inspection.
        if let Err(e) = jsonschema::validate(&schema, &jsonld) {
            println!("Error during validation on line {line_number}:");
            println!("JSON-LD:");
            println!("{jsonld:#?}");
            bail!("{e}");
        }

        println!("Successfully validated line {line_number}.");
    }

    Ok(())
}
|
||||
|
||||
#[test]
|
||||
fn validate_usgs_location_jsonld() -> Result<()> {
|
||||
|
|
@ -46,9 +84,9 @@ fn validate_usgs_location_jsonld() -> Result<()> {
|
|||
}
|
||||
});
|
||||
|
||||
let dataset_json_schema = get_location_schema();
|
||||
let location_json_schema = get_location_schema();
|
||||
|
||||
if let Err(e) = jsonschema::validate(&dataset_json_schema, &usgs_location_jsonld) {
|
||||
if let Err(e) = jsonschema::validate(&location_json_schema, &usgs_location_jsonld) {
|
||||
println!("Error during validation:");
|
||||
bail!("{e}");
|
||||
} else {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue