mirror of
https://github.com/dathere/ckan_geoconnex_bulk_runner.git
synced 2026-07-05 15:12:20 +00:00
feat: initial JSON-LD construction logic, add local test, improve schema
This commit is contained in:
parent
3497994681
commit
d5492cb2ad
7 changed files with 151 additions and 27 deletions
2
.gitignore
vendored
2
.gitignore
vendored
|
|
@ -1 +1,3 @@
|
||||||
/target
|
/target
|
||||||
|
# For local tests
|
||||||
|
/tests/*.jsonl
|
||||||
|
|
|
||||||
|
|
@ -9,3 +9,6 @@ ckanaction = "0.2.0"
|
||||||
jsonschema = "0.46.4"
|
jsonschema = "0.46.4"
|
||||||
serde_json = "1.0.149"
|
serde_json = "1.0.149"
|
||||||
tokio = { version = "1.52.1", features = ["full"] }
|
tokio = { version = "1.52.1", features = ["full"] }
|
||||||
|
|
||||||
|
[features]
|
||||||
|
local = []
|
||||||
|
|
|
||||||
|
|
@ -25,3 +25,9 @@ To include print statements in test output, run:
|
||||||
```bash
|
```bash
|
||||||
cargo test -- --nocapture
|
cargo test -- --nocapture
|
||||||
```
|
```
|
||||||
|
|
||||||
|
If you have the local dump files set up, you can run those tests with:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cargo test -F local -- --nocapture
|
||||||
|
```
|
||||||
|
|
|
||||||
|
|
@ -1,13 +1,32 @@
|
||||||
use crate::schema::get_dataset_schema;
|
use serde_json::json;
|
||||||
|
|
||||||
pub fn construct_dataset_jsonld_from_metadata(metadata: serde_json::Value) -> serde_json::Value {
|
pub fn construct_dataset_jsonld_from_metadata(
|
||||||
todo!()
|
dataset_metadata: serde_json::Value,
|
||||||
}
|
) -> serde_json::Value {
|
||||||
|
let dataset_id = dataset_metadata.get("id").unwrap().as_str().unwrap();
|
||||||
pub fn validate_dataset_jsonld(jsonld: serde_json::Value) -> bool {
|
let dataset_name = dataset_metadata.get("name").unwrap().as_str().unwrap();
|
||||||
if let Ok(_) = jsonschema::validate(&get_dataset_schema(), &jsonld) {
|
let organization_name = dataset_metadata
|
||||||
true
|
.get("organization")
|
||||||
} else {
|
.unwrap()
|
||||||
false
|
.get("title")
|
||||||
|
.unwrap();
|
||||||
|
// TODO: Align and include Geoconnex PIDs for reference feature categories to extract PIDs from them
|
||||||
|
// Then also convert spatial_full FeatureCollection to MultiPolygon if needed for gsp:hasGeometry when there are
|
||||||
|
// also non-reference feature polygons
|
||||||
|
// if let Some(spatial_full) = dataset_metadata.get("spatial_full") {}
|
||||||
|
let jsonld = json!({
|
||||||
|
"@context": {
|
||||||
|
"@vocab": "https://schema.org/",
|
||||||
|
"gsp": "http://www.opengis.net/ont/geosparql#",
|
||||||
|
},
|
||||||
|
"@type": "Dataset",
|
||||||
|
// TODO: Customize namespace based on CKAN instance being used
|
||||||
|
"@id": format!("https://geoconnex.us/nmwdh/ckan-datasets/{dataset_id}"),
|
||||||
|
"name": dataset_name,
|
||||||
|
"provider": {
|
||||||
|
"@type": "Organization",
|
||||||
|
"name": organization_name
|
||||||
}
|
}
|
||||||
|
});
|
||||||
|
jsonld
|
||||||
}
|
}
|
||||||
|
|
|
||||||
37
src/main.rs
37
src/main.rs
|
|
@ -1,5 +1,7 @@
|
||||||
use anyhow::{Result, bail};
|
use anyhow::{Result, bail};
|
||||||
use ckan_geoconnex_bulk_runner::jsonld::construct_dataset_jsonld_from_metadata;
|
use ckan_geoconnex_bulk_runner::{
|
||||||
|
jsonld::construct_dataset_jsonld_from_metadata, schema::get_dataset_schema,
|
||||||
|
};
|
||||||
|
|
||||||
// TODO: Ensure error output is only streamed to stderr as per Geoconnex docs
|
// TODO: Ensure error output is only streamed to stderr as per Geoconnex docs
|
||||||
|
|
||||||
|
|
@ -42,21 +44,44 @@ async fn main() -> Result<()> {
|
||||||
for dataset_name in dataset_names {
|
for dataset_name in dataset_names {
|
||||||
// 1. Get the dataset's metadata with /package_show by using the dataset name as the id
|
// 1. Get the dataset's metadata with /package_show by using the dataset name as the id
|
||||||
// TODO: Identify if dataset names are unique
|
// TODO: Identify if dataset names are unique
|
||||||
let dataset_metadata = ckan
|
let package_show_response = ckan
|
||||||
.package_show()
|
.package_show()
|
||||||
.id(dataset_name.as_str().unwrap().to_string())
|
.id(dataset_name.as_str().unwrap().to_string())
|
||||||
.call()
|
.call()
|
||||||
.await?;
|
.await?;
|
||||||
println!("{dataset_metadata:#?}");
|
let Some(success) = package_show_response.get("success") else {
|
||||||
|
bail!(
|
||||||
|
"CKAN API did not return success key in /package_show response for dataset {dataset_name}. Full response: {response}"
|
||||||
|
);
|
||||||
|
};
|
||||||
|
if success.as_bool().unwrap() {
|
||||||
|
let Some(dataset_metadata) = package_show_response.get("result") else {
|
||||||
|
bail!(
|
||||||
|
"CKAN API did not return result object in /package_show response for dataset {dataset_name}. Full response: {response}"
|
||||||
|
);
|
||||||
|
};
|
||||||
// 2. Construct JSON-LD based on the data from /package_show
|
// 2. Construct JSON-LD based on the data from /package_show
|
||||||
let jsonld = construct_dataset_jsonld_from_metadata(dataset_metadata);
|
let jsonld =
|
||||||
println!("{jsonld:#?}");
|
construct_dataset_jsonld_from_metadata(dataset_metadata.to_owned());
|
||||||
// 3. Validate the JSON-LD against the dataset JSON schema
|
// 3. Validate the JSON-LD against the dataset JSON schema
|
||||||
|
if jsonschema::validate(&get_dataset_schema(), &jsonld).is_ok() {
|
||||||
// 4. Print the JSON-LD on a new line to stdout
|
// 4. Print the JSON-LD on a new line to stdout
|
||||||
|
println!("{jsonld}");
|
||||||
|
} else {
|
||||||
|
eprintln!("JSON-LD for {dataset_name} is not valid.");
|
||||||
|
eprintln!("{jsonld}");
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
bail!(
|
||||||
|
"CKAN API returned {{\"success\": false\"}} for /package_show endpoint on dataset {dataset_name}. Full response: {response}"
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
bail!("CKAN API returned {{\"success\": false\"}}. Full response: {response}");
|
bail!(
|
||||||
|
"CKAN API returned {{\"success\": false\"}} for /package_list endpoint. Full response: {response}"
|
||||||
|
);
|
||||||
}
|
}
|
||||||
offset = offset + limit;
|
offset = offset + limit;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,17 +1,25 @@
|
||||||
use serde_json::json;
|
use serde_json::json;
|
||||||
|
|
||||||
pub fn get_dataset_schema() -> serde_json::Value {
|
pub fn get_dataset_schema() -> serde_json::Value {
|
||||||
json!({
|
// Allow for "local" feature
|
||||||
|
#[allow(unused_mut)]
|
||||||
|
let mut dataset_schema = json!({
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
"@context": {"type": ["string", "object"]},
|
"@context": {"type": ["string", "object"]},
|
||||||
"@type": {"type": ["string", "array"], "contains": {"const": "Dataset"}},
|
"@type": {"type": ["string", "array"], "contains": {"anyOf": [{"const": "Dataset"}, {"const": "schema:Dataset"}]}},
|
||||||
"@id": {"type": "string"},
|
"@id": {"type": "string"},
|
||||||
"name": {"type": "string"},
|
"name": {"type": "string"},
|
||||||
|
"schema:name": {"type": "string"},
|
||||||
"provider": {
|
"provider": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {"@type": {"type": "string"}, "name": {"type": "string"}},
|
"properties": {"@type": {"type": "string"}, "name": {"type": "string"}},
|
||||||
},
|
},
|
||||||
|
"schema:provider": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {"@type": {"type": "string"}, "name": {"type": "string"}},
|
||||||
|
},
|
||||||
|
"gsp:hasGeometry": {"@type": "object"},
|
||||||
"about": {
|
"about": {
|
||||||
"type": ["string", "array"],
|
"type": ["string", "array"],
|
||||||
"items": {
|
"items": {
|
||||||
|
|
@ -21,8 +29,28 @@ pub fn get_dataset_schema() -> serde_json::Value {
|
||||||
"minItems": 1
|
"minItems": 1
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
"required": ["@context", "@type", "@id", "name", "provider", "about"]
|
"anyOf": [
|
||||||
})
|
{ "required": ["@context", "@type", "@id", "name", "provider", "about"] },
|
||||||
|
{ "required": ["@context", "@type", "@id", "name", "provider", "gsp:hasGeometry"] },
|
||||||
|
{ "required": ["@context", "@type", "@id", "schema:name", "schema:provider", "about"] },
|
||||||
|
{ "required": ["@context", "@type", "@id", "schema:name", "schema:provider", "gsp:hasGeometry"] },
|
||||||
|
// { "required": ["@context", "@type", "@id", "name", "provider"] }
|
||||||
|
]
|
||||||
|
});
|
||||||
|
// Some dataset JSON-LD documents (e.g. ScienceBase) have neither "about" nor "gsp:hasGeometry", yet are still valid as per the SHACL shape
|
||||||
|
#[cfg(feature = "local")]
|
||||||
|
{
|
||||||
|
let required_array = dataset_schema
|
||||||
|
.get_mut("anyOf")
|
||||||
|
.unwrap()
|
||||||
|
.as_array_mut()
|
||||||
|
.unwrap();
|
||||||
|
required_array.insert(
|
||||||
|
required_array.len(),
|
||||||
|
json!({ "required": ["@context", "@type", "@id", "schema:name", "schema:provider"] }),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
dataset_schema
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_location_schema() -> serde_json::Value {
|
pub fn get_location_schema() -> serde_json::Value {
|
||||||
|
|
@ -40,6 +68,9 @@ pub fn get_location_schema() -> serde_json::Value {
|
||||||
"geo": {"type": "object"},
|
"geo": {"type": "object"},
|
||||||
"gsp:hasGeometry": {"type": "object"}
|
"gsp:hasGeometry": {"type": "object"}
|
||||||
},
|
},
|
||||||
"required": ["@context", "@type", "@id", "name", "provider", "geo", "gsp:hasGeometry"]
|
"anyOf": [
|
||||||
|
{ "required": ["@context", "@type", "@id", "name", "provider", "geo", "gsp:hasGeometry"] },
|
||||||
|
{ "required": ["@context", "@type", "@id", "schema:name", "schema:provider", "geo", "gsp:hasGeometry"] },
|
||||||
|
]
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,44 @@
|
||||||
use anyhow::{Result, bail};
|
use anyhow::{Result, bail};
|
||||||
|
#[cfg(feature = "local")]
|
||||||
|
use ckan_geoconnex_bulk_runner::schema::get_dataset_schema;
|
||||||
use ckan_geoconnex_bulk_runner::schema::get_location_schema;
|
use ckan_geoconnex_bulk_runner::schema::get_location_schema;
|
||||||
use serde_json::json;
|
use serde_json::json;
|
||||||
|
#[cfg(feature = "local")]
|
||||||
|
use std::{
|
||||||
|
fs::File,
|
||||||
|
io::{BufRead, BufReader},
|
||||||
|
};
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[cfg(feature = "local")]
|
||||||
|
fn validate_sciencebase_dump() -> Result<()> {
|
||||||
|
let file_path = "./tests/sciencebase_jsonld_dump_202605-06.jsonl";
|
||||||
|
if !std::fs::exists(file_path)? {
|
||||||
|
bail!("File path {file_path} does not exist.")
|
||||||
|
}
|
||||||
|
|
||||||
|
let dataset_json_schema = get_dataset_schema();
|
||||||
|
|
||||||
|
// Read JSONL file line-by-line
|
||||||
|
let file = File::open(file_path)?;
|
||||||
|
let reader = BufReader::new(file);
|
||||||
|
|
||||||
|
let mut line_number = 0;
|
||||||
|
for line in reader.lines() {
|
||||||
|
let jsonld: serde_json::Value = serde_json::from_str(line?.as_str())?;
|
||||||
|
if let Err(e) = jsonschema::validate(&dataset_json_schema, &jsonld) {
|
||||||
|
println!("Error during validation on line {line_number}:");
|
||||||
|
println!("JSON-LD:");
|
||||||
|
println!("{jsonld:#?}");
|
||||||
|
bail!("{e}");
|
||||||
|
} else {
|
||||||
|
println!("Successfully validated line {line_number}.");
|
||||||
|
line_number = line_number + 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn validate_usgs_location_jsonld() -> Result<()> {
|
fn validate_usgs_location_jsonld() -> Result<()> {
|
||||||
|
|
@ -46,9 +84,9 @@ fn validate_usgs_location_jsonld() -> Result<()> {
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
let dataset_json_schema = get_location_schema();
|
let location_json_schema = get_location_schema();
|
||||||
|
|
||||||
if let Err(e) = jsonschema::validate(&dataset_json_schema, &usgs_location_jsonld) {
|
if let Err(e) = jsonschema::validate(&location_json_schema, &usgs_location_jsonld) {
|
||||||
println!("Error during validation:");
|
println!("Error during validation:");
|
||||||
bail!("{e}");
|
bail!("{e}");
|
||||||
} else {
|
} else {
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue