mirror of
https://github.com/dathere/ckan_geoconnex_bulk_runner.git
synced 2026-07-05 15:12:20 +00:00
feat: enhanced cargo workspace, NM usage, Dockerfile
This commit is contained in:
parent
71b08a53f0
commit
3a79fb2b0a
18 changed files with 362 additions and 2478 deletions
14
geoconnex_utils/Cargo.toml
Normal file
14
geoconnex_utils/Cargo.toml
Normal file
|
|
@ -0,0 +1,14 @@
|
|||
[package]
|
||||
name = "geoconnex_utils"
|
||||
version = "0.1.0"
|
||||
edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0.102"
|
||||
ckanaction = "0.2.1"
|
||||
jsonschema = "0.46.4"
|
||||
serde_json = "1.0.149"
|
||||
tokio = { version = "1.52.1", features = ["full"] }
|
||||
|
||||
[features]
|
||||
local = []
|
||||
77
geoconnex_utils/src/jsonld.rs
Normal file
77
geoconnex_utils/src/jsonld.rs
Normal file
|
|
@ -0,0 +1,77 @@
|
|||
use anyhow::{Result, bail};
|
||||
use serde_json::json;
|
||||
|
||||
pub fn construct_dataset_jsonld_from_metadata(
|
||||
dataset_metadata: serde_json::Value,
|
||||
) -> Result<serde_json::Value> {
|
||||
let dataset_id = dataset_metadata.get("id").unwrap().as_str().unwrap();
|
||||
eprintln!("Attempting to construct JSON-LD for dataset {dataset_id}");
|
||||
let dataset_title = dataset_metadata.get("title").unwrap().as_str().unwrap();
|
||||
let organization_name = dataset_metadata
|
||||
.get("organization")
|
||||
.unwrap()
|
||||
.get("title")
|
||||
.unwrap();
|
||||
// TODO: Align and include Geoconnex PIDs for reference feature categories to extract PIDs from them
|
||||
// Then also convert spatial_full FeatureCollection to Multipolygon if needed for gsp:hasGeometry when there are
|
||||
// also non-reference feature polygons
|
||||
let mut about = vec![];
|
||||
if let Some(spatial_full) = dataset_metadata.get("spatial_full") {
|
||||
let Some(spatial_full_str) = spatial_full.as_str() else {
|
||||
bail!("Could not parse spatial_full as string.");
|
||||
};
|
||||
if !spatial_full_str.is_empty() {
|
||||
let Ok(spatial_full_json) = serde_json::from_str::<serde_json::Value>(spatial_full_str)
|
||||
else {
|
||||
bail!(
|
||||
"Error while attempting to deserialize spatial_full string to serde_json::Value."
|
||||
);
|
||||
};
|
||||
let Some(features_value) = spatial_full_json.get("features") else {
|
||||
bail!("Error while attempting to get value of features from spatial_full GeoJSON.");
|
||||
};
|
||||
let Some(features) = features_value.as_array() else {
|
||||
bail!(
|
||||
"Error while attempting to take features value as array from spatial_full GeoJSON."
|
||||
);
|
||||
};
|
||||
for feature in features {
|
||||
let Some(properties) = feature.get("properties") else {
|
||||
bail!(
|
||||
"Error while attempting to get properties from features from spatial_full GeoJSON."
|
||||
);
|
||||
};
|
||||
if let Some(pid) = properties.get("pid") {
|
||||
let Some(pid_string) = pid.as_str() else {
|
||||
bail!("Error while attempting to convert PID as str from &Value.");
|
||||
};
|
||||
about.push(json!({
|
||||
"@id": pid_string,
|
||||
"@type": "Place"
|
||||
}));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
let mut jsonld = json!({
|
||||
"@context": {
|
||||
"@vocab": "https://schema.org/",
|
||||
"gsp": "http://www.opengis.net/ont/geosparql#",
|
||||
},
|
||||
"@type": "Dataset",
|
||||
// TODO: Customize namespace based on CKAN instance being used
|
||||
"@id": format!("https://geoconnex.us/ckan/sandbox/{dataset_id}"),
|
||||
"name": dataset_title,
|
||||
"provider": {
|
||||
"@type": "Organization",
|
||||
"name": organization_name
|
||||
},
|
||||
// TODO: Customize CKAN instance URL based on CKAN instance being used
|
||||
"url": format!("https://sandbox.opendataportal.us/dataset/{dataset_id}")
|
||||
});
|
||||
let jsonld_map = jsonld.as_object_mut().unwrap();
|
||||
if about.len() > 0 {
|
||||
jsonld_map.insert("about".to_string(), serde_json::to_value(about).unwrap());
|
||||
}
|
||||
Ok(serde_json::to_value(jsonld_map).unwrap())
|
||||
}
|
||||
2
geoconnex_utils/src/lib.rs
Normal file
2
geoconnex_utils/src/lib.rs
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
pub mod jsonld;
|
||||
pub mod schema;
|
||||
76
geoconnex_utils/src/schema.rs
Normal file
76
geoconnex_utils/src/schema.rs
Normal file
|
|
@ -0,0 +1,76 @@
|
|||
use serde_json::json;
|
||||
|
||||
pub fn get_dataset_schema() -> serde_json::Value {
|
||||
// Allow for "local" feature
|
||||
#[allow(unused_mut)]
|
||||
let mut dataset_schema = json!({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"@context": {"type": ["string", "object"]},
|
||||
"@type": {"type": ["string", "array"], "contains": {"anyOf": [{"const": "Dataset"}, {"const": "schema:Dataset"}]}},
|
||||
"@id": {"type": "string"},
|
||||
"name": {"type": "string"},
|
||||
"schema:name": {"type": "string"},
|
||||
"provider": {
|
||||
"type": "object",
|
||||
"properties": {"@type": {"type": "string"}, "name": {"type": "string"}},
|
||||
},
|
||||
"schema:provider": {
|
||||
"type": "object",
|
||||
"properties": {"@type": {"type": "string"}, "name": {"type": "string"}},
|
||||
},
|
||||
"gsp:hasGeometry": {"@type": "object"},
|
||||
"about": {
|
||||
"type": ["string", "array"],
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {"@id": {"type": "string"}, "@type": {"const": "Place"}},
|
||||
},
|
||||
"minItems": 1
|
||||
},
|
||||
},
|
||||
"anyOf": [
|
||||
{ "required": ["@context", "@type", "@id", "name", "provider", "about"] },
|
||||
{ "required": ["@context", "@type", "@id", "name", "provider", "gsp:hasGeometry"] },
|
||||
{ "required": ["@context", "@type", "@id", "schema:name", "schema:provider", "about"] },
|
||||
{ "required": ["@context", "@type", "@id", "schema:name", "schema:provider", "gsp:hasGeometry"] },
|
||||
// { "required": ["@context", "@type", "@id", "name", "provider"] }
|
||||
]
|
||||
});
|
||||
// Some JSON-LD for datasets (e.g. sciencebase) do not have about or gsp:hasGeometry yet are still valid as per SHACL shape
|
||||
#[cfg(feature = "local")]
|
||||
{
|
||||
let required_array = dataset_schema
|
||||
.get_mut("anyOf")
|
||||
.unwrap()
|
||||
.as_array_mut()
|
||||
.unwrap();
|
||||
required_array.insert(
|
||||
required_array.len(),
|
||||
json!({ "required": ["@context", "@type", "@id", "schema:name", "schema:provider"] }),
|
||||
);
|
||||
}
|
||||
dataset_schema
|
||||
}
|
||||
|
||||
pub fn get_location_schema() -> serde_json::Value {
|
||||
json!({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"@context": {"type": ["string", "object"]},
|
||||
"@type": {"type": ["string", "array"], "contains": {"const": "Place"}},
|
||||
"@id": {"type": "string"},
|
||||
"name": {"type": "string"},
|
||||
"provider": {
|
||||
"type": "object",
|
||||
"properties": {"@type": {"type": "string"}, "name": {"type": "string"}},
|
||||
},
|
||||
"geo": {"type": "object"},
|
||||
"gsp:hasGeometry": {"type": "object"}
|
||||
},
|
||||
"anyOf": [
|
||||
{ "required": ["@context", "@type", "@id", "name", "provider", "geo", "gsp:hasGeometry"] },
|
||||
{ "required": ["@context", "@type", "@id", "schema:name", "schema:provider", "geo", "gsp:hasGeometry"] },
|
||||
]
|
||||
})
|
||||
}
|
||||
97
geoconnex_utils/tests/validate_jsonld.rs
Normal file
97
geoconnex_utils/tests/validate_jsonld.rs
Normal file
|
|
@ -0,0 +1,97 @@
|
|||
use anyhow::{bail, Result};
|
||||
#[cfg(feature = "local")]
|
||||
use geoconnex_utils::schema::get_dataset_schema;
|
||||
use geoconnex_utils::schema::get_location_schema;
|
||||
use serde_json::json;
|
||||
#[cfg(feature = "local")]
|
||||
use std::{
|
||||
fs::File,
|
||||
io::{BufRead, BufReader},
|
||||
};
|
||||
|
||||
/// Validates every record of the local sciencebase JSON-LD dump against the
/// dataset schema, stopping at the first record that fails.
///
/// Only compiled with the `local` feature; the dump file is expected at
/// `./tests/sciencebase_jsonld_dump_202605-06.jsonl`.
#[test]
#[cfg(feature = "local")]
fn validate_sciencebase_dump() -> Result<()> {
    let file_path = "./tests/sciencebase_jsonld_dump_202605-06.jsonl";
    // Open once and translate "not found" into the descriptive message rather
    // than checking for existence and then opening separately.
    let file = match File::open(file_path) {
        Ok(file) => file,
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
            bail!("File path {file_path} does not exist.")
        }
        Err(e) => return Err(e.into()),
    };

    // Compile the schema once instead of once per line.
    let dataset_json_schema = get_dataset_schema();
    let validator = match jsonschema::validator_for(&dataset_json_schema) {
        Ok(validator) => validator,
        Err(e) => bail!("Dataset schema failed to compile: {e}"),
    };

    // Read JSONL file line-by-line
    for (index, line) in BufReader::new(file).lines().enumerate() {
        // Report 1-based line numbers so they match what an editor shows.
        let line_number = index + 1;
        let line = line?;
        // Blank lines carry no record.
        if line.trim().is_empty() {
            continue;
        }

        let jsonld: serde_json::Value = match serde_json::from_str(&line) {
            Ok(value) => value,
            Err(e) => bail!("Line {line_number} is not valid JSON: {e}"),
        };
        if let Err(e) = validator.validate(&jsonld) {
            println!("Error during validation on line {line_number}:");
            println!("JSON-LD:");
            println!("{jsonld:#?}");
            bail!("{e}");
        }
        println!("Successfully validated line {line_number}.");
    }

    Ok(())
}
|
||||
|
||||
/// Checks that a representative USGS monitoring-location JSON-LD document is
/// accepted by the location schema.
#[test]
fn validate_usgs_location_jsonld() -> Result<()> {
    let schema = get_location_schema();

    // Sample monitoring-location record used as the validation fixture.
    let instance = json!({
        "@context": {
            "@vocab": "https://schema.org/",
            "gsp": "http://www.opengis.net/ont/geosparql#",
            "hyf": "https://www.opengis.net/def/schema/hy_features/hyf/",
            "locType": "https://api.waterdata.usgs.gov/ogcapi/v0/collections/site-types/items/"
        },
        "@type": [
            "Place",
            "hyf:HY_HydrometricFeature",
            "hyf:HY_HydroLocation",
            "locType:ST-CA"
        ],
        "@id": "https://geoconnex.us/usgs/monitoring-location/USGS-253937080285200",
        "name": "BLACKCREEKCANALWESTOFSOUTHMIAMI FLA",
        "identifier": {
            "@type": "PropertyValue",
            "propertyID": "USGS site identifier",
            "value": "253937080285200"
        },
        "url": "https://api.waterdata.usgs.gov/ogcapi/v0/collections/monitoring-locations/items/USGS-253937080285200",
        "provider": {
            "@type": "GovernmentOrganization",
            "name": "U.S. Geological Survey"
        },
        "geo": {
            "@type": "GeoCoordinates",
            "latitude": 25.6606597832648,
            "longitude": -80.4808896071386
        },
        "gsp:hasGeometry": {
            "@type": "http://www.opengis.net/ont/sf#Point",
            "gsp:asWKT": {
                "@type": "gsp:wktLiteral",
                "@value": "POINT (-80.4808896071386 25.6606597832648)"
            },
            "gsp:crs": {
                "@id": "http://www.opengis.net/def/crs/OGC/1.3/CRS84"
            }
        }
    });

    match jsonschema::validate(&schema, &instance) {
        Ok(_) => println!("Successfully validated."),
        Err(validation_error) => {
            println!("Error during validation:");
            bail!("{validation_error}");
        }
    }

    Ok(())
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue