feat: enhanced cargo workspace, NM usage, Dockerfile

This commit is contained in:
rzmk 2026-06-15 11:20:51 -04:00
parent 71b08a53f0
commit 3a79fb2b0a
18 changed files with 362 additions and 2478 deletions

View file

@ -0,0 +1,14 @@
# Manifest for the geoconnex_utils crate.
[package]
name = "geoconnex_utils"
version = "0.1.0"
edition = "2024"
[dependencies]
# Error type and `bail!` macro used by the library and its tests.
anyhow = "1.0.102"
# NOTE(review): not referenced by the sources visible in this change — confirm it is still needed.
ckanaction = "0.2.1"
# JSON Schema validation used by the integration tests.
jsonschema = "0.46.4"
# `json!` macro and `Value` type used to build JSON-LD documents and schemas.
serde_json = "1.0.149"
# NOTE(review): not referenced by the sources visible in this change — confirm "full" is required.
tokio = { version = "1.52.1", features = ["full"] }
[features]
# Opt-in feature: relaxes the dataset schema with one extra `anyOf` branch and
# enables the test that validates a local sciencebase JSONL dump.
local = []

View file

@ -0,0 +1,77 @@
use anyhow::{Result, bail};
use serde_json::json;
pub fn construct_dataset_jsonld_from_metadata(
dataset_metadata: serde_json::Value,
) -> Result<serde_json::Value> {
let dataset_id = dataset_metadata.get("id").unwrap().as_str().unwrap();
eprintln!("Attempting to construct JSON-LD for dataset {dataset_id}");
let dataset_title = dataset_metadata.get("title").unwrap().as_str().unwrap();
let organization_name = dataset_metadata
.get("organization")
.unwrap()
.get("title")
.unwrap();
// TODO: Align and include Geoconnex PIDs for reference feature categories to extract PIDs from them
// Then also convert spatial_full FeatureCollection to Multipolygon if needed for gsp:hasGeometry when there are
// also non-reference feature polygons
let mut about = vec![];
if let Some(spatial_full) = dataset_metadata.get("spatial_full") {
let Some(spatial_full_str) = spatial_full.as_str() else {
bail!("Could not parse spatial_full as string.");
};
if !spatial_full_str.is_empty() {
let Ok(spatial_full_json) = serde_json::from_str::<serde_json::Value>(spatial_full_str)
else {
bail!(
"Error while attempting to deserialize spatial_full string to serde_json::Value."
);
};
let Some(features_value) = spatial_full_json.get("features") else {
bail!("Error while attempting to get value of features from spatial_full GeoJSON.");
};
let Some(features) = features_value.as_array() else {
bail!(
"Error while attempting to take features value as array from spatial_full GeoJSON."
);
};
for feature in features {
let Some(properties) = feature.get("properties") else {
bail!(
"Error while attempting to get properties from features from spatial_full GeoJSON."
);
};
if let Some(pid) = properties.get("pid") {
let Some(pid_string) = pid.as_str() else {
bail!("Error while attempting to convert PID as str from &Value.");
};
about.push(json!({
"@id": pid_string,
"@type": "Place"
}));
}
}
}
}
let mut jsonld = json!({
"@context": {
"@vocab": "https://schema.org/",
"gsp": "http://www.opengis.net/ont/geosparql#",
},
"@type": "Dataset",
// TODO: Customize namespace based on CKAN instance being used
"@id": format!("https://geoconnex.us/ckan/sandbox/{dataset_id}"),
"name": dataset_title,
"provider": {
"@type": "Organization",
"name": organization_name
},
// TODO: Customize CKAN instance URL based on CKAN instance being used
"url": format!("https://sandbox.opendataportal.us/dataset/{dataset_id}")
});
let jsonld_map = jsonld.as_object_mut().unwrap();
if about.len() > 0 {
jsonld_map.insert("about".to_string(), serde_json::to_value(about).unwrap());
}
Ok(serde_json::to_value(jsonld_map).unwrap())
}

View file

@ -0,0 +1,2 @@
/// JSON-LD helpers (presumably the dataset JSON-LD construction code — confirm against the module).
pub mod jsonld;
/// JSON Schema builders such as `get_dataset_schema` and `get_location_schema`.
pub mod schema;

View file

@ -0,0 +1,76 @@
use serde_json::json;
/// Returns the JSON Schema used to validate `Dataset` JSON-LD documents.
///
/// A document must provide `@context`, `@type`, `@id`, a name and a provider
/// (either un-prefixed or `schema:`-prefixed), plus at least one of `about`
/// or `gsp:hasGeometry`.
///
/// With the `local` cargo feature enabled, one extra `anyOf` branch is added
/// that accepts `schema:`-prefixed documents carrying neither `about` nor
/// `gsp:hasGeometry`.
pub fn get_dataset_schema() -> serde_json::Value {
    // `mut` is only needed when the "local" feature appends to `anyOf`.
    #[allow(unused_mut)]
    let mut dataset_schema = json!({
        "type": "object",
        "properties": {
            "@context": {"type": ["string", "object"]},
            // NOTE(review): `contains` only constrains arrays, so a plain string
            // `@type` is accepted here whatever its value.
            "@type": {"type": ["string", "array"], "contains": {"anyOf": [{"const": "Dataset"}, {"const": "schema:Dataset"}]}},
            "@id": {"type": "string"},
            "name": {"type": "string"},
            "schema:name": {"type": "string"},
            "provider": {
                "type": "object",
                "properties": {"@type": {"type": "string"}, "name": {"type": "string"}},
            },
            "schema:provider": {
                "type": "object",
                "properties": {"@type": {"type": "string"}, "name": {"type": "string"}},
            },
            // `type` (not `@type`) is the JSON Schema keyword; `@type` would be
            // ignored and leave this property unconstrained.
            "gsp:hasGeometry": {"type": "object"},
            "about": {
                "type": ["string", "array"],
                "items": {
                    "type": "object",
                    "properties": {"@id": {"type": "string"}, "@type": {"const": "Place"}},
                },
                "minItems": 1
            },
        },
        "anyOf": [
            { "required": ["@context", "@type", "@id", "name", "provider", "about"] },
            { "required": ["@context", "@type", "@id", "name", "provider", "gsp:hasGeometry"] },
            { "required": ["@context", "@type", "@id", "schema:name", "schema:provider", "about"] },
            { "required": ["@context", "@type", "@id", "schema:name", "schema:provider", "gsp:hasGeometry"] },
        ]
    });
    // Some JSON-LD for datasets (e.g. sciencebase) do not have about or gsp:hasGeometry yet are still valid as per SHACL shape
    #[cfg(feature = "local")]
    {
        dataset_schema
            .get_mut("anyOf")
            .and_then(|any_of| any_of.as_array_mut())
            .expect("dataset schema literal always contains an `anyOf` array")
            .push(json!({ "required": ["@context", "@type", "@id", "schema:name", "schema:provider"] }));
    }
    dataset_schema
}
/// Returns the JSON Schema used to validate `Place` (location) JSON-LD documents.
///
/// A document must provide `@context`, `@type`, `@id`, `geo`,
/// `gsp:hasGeometry`, and a name and provider given either un-prefixed
/// (`name` / `provider`) or `schema:`-prefixed.
pub fn get_location_schema() -> serde_json::Value {
    // Shared shape for `provider` and `schema:provider`.
    let provider_schema = json!({
        "type": "object",
        "properties": {"@type": {"type": "string"}, "name": {"type": "string"}},
    });
    json!({
        "type": "object",
        "properties": {
            "@context": {"type": ["string", "object"]},
            // NOTE(review): `contains` only constrains arrays, so a plain string
            // `@type` is accepted here whatever its value.
            "@type": {"type": ["string", "array"], "contains": {"const": "Place"}},
            "@id": {"type": "string"},
            "name": {"type": "string"},
            // The prefixed alternates are accepted by `anyOf` below, so they are
            // type-checked the same way the dataset schema checks them.
            "schema:name": {"type": "string"},
            "provider": provider_schema.clone(),
            "schema:provider": provider_schema,
            "geo": {"type": "object"},
            "gsp:hasGeometry": {"type": "object"}
        },
        "anyOf": [
            { "required": ["@context", "@type", "@id", "name", "provider", "geo", "gsp:hasGeometry"] },
            { "required": ["@context", "@type", "@id", "schema:name", "schema:provider", "geo", "gsp:hasGeometry"] },
        ]
    })
}

View file

@ -0,0 +1,97 @@
use anyhow::{bail, Result};
#[cfg(feature = "local")]
use geoconnex_utils::schema::get_dataset_schema;
use geoconnex_utils::schema::get_location_schema;
use serde_json::json;
#[cfg(feature = "local")]
use std::{
fs::File,
io::{BufRead, BufReader},
};
/// Validates every JSON-LD record of a local sciencebase JSONL dump against
/// the dataset schema, failing on the first record that does not validate.
///
/// Only compiled with the `local` feature. Reported line numbers are 1-based
/// so they match the line shown by an editor.
#[test]
#[cfg(feature = "local")]
fn validate_sciencebase_dump() -> Result<()> {
    let file_path = "./tests/sciencebase_jsonld_dump_202605-06.jsonl";
    if !std::fs::exists(file_path)? {
        bail!("File path {file_path} does not exist.")
    }
    // Compile the schema once instead of once per line.
    let dataset_json_schema = get_dataset_schema();
    let validator = match jsonschema::validator_for(&dataset_json_schema) {
        Ok(validator) => validator,
        Err(e) => bail!("Dataset JSON Schema failed to compile: {e}"),
    };
    // Read JSONL file line-by-line
    let reader = BufReader::new(File::open(file_path)?);
    for (index, line) in reader.lines().enumerate() {
        let line_number = index + 1;
        let line = line?;
        // Tolerate blank lines (e.g. a trailing one) rather than failing to parse them.
        if line.trim().is_empty() {
            continue;
        }
        let jsonld: serde_json::Value = serde_json::from_str(&line)
            .map_err(|e| anyhow::anyhow!("Invalid JSON on line {line_number}: {e}"))?;
        if let Err(e) = validator.validate(&jsonld) {
            println!("Error during validation on line {line_number}:");
            println!("JSON-LD:");
            println!("{jsonld:#?}");
            bail!("{e}");
        }
        println!("Successfully validated line {line_number}.");
    }
    Ok(())
}
/// Checks that a representative USGS monitoring-location JSON-LD document
/// is accepted by the location schema.
#[test]
fn validate_usgs_location_jsonld() -> Result<()> {
    let schema = get_location_schema();
    // Sample document for monitoring location USGS-253937080285200.
    let document = json!({
        "@context": {
            "@vocab": "https://schema.org/",
            "gsp": "http://www.opengis.net/ont/geosparql#",
            "hyf": "https://www.opengis.net/def/schema/hy_features/hyf/",
            "locType": "https://api.waterdata.usgs.gov/ogcapi/v0/collections/site-types/items/"
        },
        "@type": [
            "Place",
            "hyf:HY_HydrometricFeature",
            "hyf:HY_HydroLocation",
            "locType:ST-CA"
        ],
        "@id": "https://geoconnex.us/usgs/monitoring-location/USGS-253937080285200",
        "name": "BLACKCREEKCANALWESTOFSOUTHMIAMI FLA",
        "identifier": {
            "@type": "PropertyValue",
            "propertyID": "USGS site identifier",
            "value": "253937080285200"
        },
        "url": "https://api.waterdata.usgs.gov/ogcapi/v0/collections/monitoring-locations/items/USGS-253937080285200",
        "provider": {
            "@type": "GovernmentOrganization",
            "name": "U.S. Geological Survey"
        },
        "geo": {
            "@type": "GeoCoordinates",
            "latitude": 25.6606597832648,
            "longitude": -80.4808896071386
        },
        "gsp:hasGeometry": {
            "@type": "http://www.opengis.net/ont/sf#Point",
            "gsp:asWKT": {
                "@type": "gsp:wktLiteral",
                "@value": "POINT (-80.4808896071386 25.6606597832648)"
            },
            "gsp:crs": {
                "@id": "http://www.opengis.net/def/crs/OGC/1.3/CRS84"
            }
        }
    });
    match jsonschema::validate(&schema, &document) {
        Ok(()) => println!("Successfully validated."),
        Err(e) => {
            println!("Error during validation:");
            bail!("{e}");
        }
    }
    Ok(())
}