feat: add ckanext-scheming and DP+

This commit is contained in:
rzmk 2025-08-16 01:09:58 -04:00
parent 3dcfd561fa
commit c785e38335
3 changed files with 233 additions and 1 deletions

86
Cargo.lock generated
View file

@ -46,6 +46,7 @@ dependencies = [
"inquire", "inquire",
"owo-colors", "owo-colors",
"rust-ini", "rust-ini",
"serde_json",
"xshell", "xshell",
"xshell-venv", "xshell-venv",
] ]
@ -212,6 +213,12 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7655c9839580ee829dfacba1d1278c2b7883e50a277ff7541299489d6bdfdc45" checksum = "7655c9839580ee829dfacba1d1278c2b7883e50a277ff7541299489d6bdfdc45"
[[package]]
name = "itoa"
version = "1.0.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
[[package]] [[package]]
name = "libc" name = "libc"
version = "0.2.175" version = "0.2.175"
@ -240,6 +247,12 @@ version = "0.4.27"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94"
[[package]]
name = "memchr"
version = "2.7.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0"
[[package]] [[package]]
name = "mio" name = "mio"
version = "0.8.11" version = "0.8.11"
@ -310,6 +323,24 @@ dependencies = [
"windows-targets 0.52.6", "windows-targets 0.52.6",
] ]
[[package]]
name = "proc-macro2"
version = "1.0.97"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d61789d7719defeb74ea5fe81f2fdfdbd28a803847077cecce2ff14e1472f6f1"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d"
dependencies = [
"proc-macro2",
]
[[package]] [[package]]
name = "redox_syscall" name = "redox_syscall"
version = "0.5.17" version = "0.5.17"
@ -342,12 +373,50 @@ dependencies = [
"windows-sys 0.60.2", "windows-sys 0.60.2",
] ]
[[package]]
name = "ryu"
version = "1.0.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f"
[[package]] [[package]]
name = "scopeguard" name = "scopeguard"
version = "1.2.0" version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
name = "serde"
version = "1.0.219"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.219"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "serde_json"
version = "1.0.142"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "030fedb782600dcbd6f02d479bf0d817ac3bb40d644745b769d6a96bc3afc5a7"
dependencies = [
"itoa",
"memchr",
"ryu",
"serde",
]
[[package]] [[package]]
name = "signal-hook" name = "signal-hook"
version = "0.3.18" version = "0.3.18"
@ -403,6 +472,17 @@ dependencies = [
"is_ci", "is_ci",
] ]
[[package]]
name = "syn"
version = "2.0.106"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]] [[package]]
name = "thread_local" name = "thread_local"
version = "1.1.9" version = "1.1.9"
@ -421,6 +501,12 @@ dependencies = [
"crunchy", "crunchy",
] ]
[[package]]
name = "unicode-ident"
version = "1.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
[[package]] [[package]]
name = "unicode-segmentation" name = "unicode-segmentation"
version = "1.12.0" version = "1.12.0"

View file

@ -8,5 +8,6 @@ anyhow = "1.0.99"
inquire = "0.7.5" inquire = "0.7.5"
owo-colors = { version = "4.2.2", features = ["supports-colors"] } owo-colors = { version = "4.2.2", features = ["supports-colors"] }
rust-ini = "0.21.2" rust-ini = "0.21.2"
serde_json = "1.0.142"
xshell = "0.2.7" xshell = "0.2.7"
xshell-venv = "1.3.0" xshell-venv = "1.3.0"

View file

@ -3,6 +3,7 @@ use std::{path::PathBuf, str::FromStr};
use anyhow::Result; use anyhow::Result;
use inquire::Confirm; use inquire::Confirm;
use owo_colors::{OwoColorize, Stream::Stdout}; use owo_colors::{OwoColorize, Stream::Stdout};
use serde_json::json;
use xshell::cmd; use xshell::cmd;
use xshell_venv::{Shell, VirtualEnv}; use xshell_venv::{Shell, VirtualEnv};
@ -176,6 +177,7 @@ POSTGRES_PASSWORD=pass";
"\n{} Enabling DataStore plugin, adding config URLs in /etc/ckan/default/ckan.ini and updating permissions...", "\n{} Enabling DataStore plugin, adding config URLs in /etc/ckan/default/ckan.ini and updating permissions...",
"7.".if_supports_color(Stdout, |text| text.on_magenta().white()), "7.".if_supports_color(Stdout, |text| text.on_magenta().white()),
); );
// TODO: use the ckan config-tool command instead of rust-ini
let mut conf = ini::Ini::load_from_file("/etc/ckan/default/ckan.ini")?; let mut conf = ini::Ini::load_from_file("/etc/ckan/default/ckan.ini")?;
let app_main_section = conf.section_mut(Some("app:main")).unwrap(); let app_main_section = conf.section_mut(Some("app:main")).unwrap();
let mut ckan_plugins = app_main_section.get("ckan.plugins").unwrap().to_string(); let mut ckan_plugins = app_main_section.get("ckan.plugins").unwrap().to_string();
@ -189,6 +191,7 @@ POSTGRES_PASSWORD=pass";
"ckan.datastore.read_url", "ckan.datastore.read_url",
"postgresql://datastore_default:pass@localhost/datastore_default", "postgresql://datastore_default:pass@localhost/datastore_default",
); );
app_main_section.insert("ckan.datastore.sqlsearch.enabled", "true");
conf.write_to_file("/etc/ckan/default/ckan.ini")?; conf.write_to_file("/etc/ckan/default/ckan.ini")?;
let postgres_container_id = cmd!( let postgres_container_id = cmd!(
sh, sh,
@ -204,7 +207,7 @@ POSTGRES_PASSWORD=pass";
loop { loop {
std::thread::sleep(std::time::Duration::from_secs(2)); std::thread::sleep(std::time::Duration::from_secs(2));
if std::fs::exists("permissions.sql")? { if std::fs::exists("permissions.sql")? {
break break;
} }
} }
sh.change_dir(format!("/home/{username}")); sh.change_dir(format!("/home/{username}"));
@ -220,6 +223,148 @@ POSTGRES_PASSWORD=pass";
.if_supports_color(Stdout, |text| text.on_green().white()) .if_supports_color(Stdout, |text| text.on_green().white())
); );
// Step 8: install the ckanext-scheming extension and register it in ckan.ini.
println!(
    "\n{} Installing ckanext-scheming and DataPusher+ extensions...",
    "8.".if_supports_color(Stdout, |text| text.on_magenta().white()),
);
// Editable install straight from the upstream git repository.
cmd!(
    sh,
    "pip install -e git+https://github.com/ckan/ckanext-scheming.git#egg=ckanext-scheming"
)
.run()?;
let mut conf = ini::Ini::load_from_file("/etc/ckan/default/ckan.ini")?;
// A missing section or key is reported as an error instead of a panic, matching the
// `?`-based error handling used by the surrounding steps.
let app_main_section = conf.section_mut(Some("app:main")).ok_or_else(|| {
    anyhow::anyhow!("missing [app:main] section in /etc/ckan/default/ckan.ini")
})?;
let mut ckan_plugins = app_main_section
    .get("ckan.plugins")
    .ok_or_else(|| {
        anyhow::anyhow!("missing ckan.plugins key in [app:main] of /etc/ckan/default/ckan.ini")
    })?
    .to_string();
// `ckan.plugins` is a whitespace-separated list; only append the plugin when it is not
// already listed so that re-running the installer does not duplicate the entry.
if !ckan_plugins
    .split_whitespace()
    .any(|plugin| plugin == "scheming_datasets")
{
    ckan_plugins.push_str(" scheming_datasets");
}
app_main_section.insert("ckan.plugins", ckan_plugins);
app_main_section.insert("scheming.presets", "ckanext.scheming:presets.json");
app_main_section.insert("scheming.dataset_fallback", "false");
conf.write_to_file("/etc/ckan/default/ckan.ini")?;
// Install DataPusher+.
// 1. System packages needed to build and run it (installed non-interactively via apt).
cmd!(sh, "sudo apt install python3-virtualenv python3-dev python3-pip python3-wheel build-essential libxslt1-dev libxml2-dev zlib1g-dev git libffi-dev libpq-dev uchardet -y").run()?;
// 2. Editable pip install of the `main` branch, run from the CKAN source directory,
//    followed by the requirements file of the checked-out datapusher-plus directory.
sh.change_dir("/usr/lib/ckan/default/src");
cmd!(sh, "pip install -e git+https://github.com/dathere/datapusher-plus.git@main#egg=datapusher-plus").run()?;
sh.change_dir("/usr/lib/ckan/default/src/datapusher-plus");
cmd!(sh, "pip install -r requirements.txt").run()?;
// 3. Back in the user's home directory: download the qsv 6.0.1 release archive, unpack it,
//    delete the archive and move the `qsvdp` binary into /usr/local/bin.
sh.change_dir(format!("/home/{username}"));
cmd!(sh, "wget https://github.com/dathere/qsv/releases/download/6.0.1/qsv-6.0.1-x86_64-unknown-linux-gnu.zip").run()?;
cmd!(sh, "sudo apt install unzip -y").run()?;
cmd!(sh, "unzip qsv-6.0.1-x86_64-unknown-linux-gnu.zip").run()?;
cmd!(sh, "sudo rm -rf qsv-6.0.1-x86_64-unknown-linux-gnu.zip").run()?;
cmd!(sh, "sudo mv ./qsvdp /usr/local/bin").run()?;
// Register the DataPusher+ plugin in ckan.ini and point ckanext-scheming at the dataset
// schema shipped with DataPusher+.
let mut conf = ini::Ini::load_from_file("/etc/ckan/default/ckan.ini")?;
// A missing section or key is reported as an error instead of a panic, matching the
// `?`-based error handling used by the surrounding steps.
let app_main_section = conf.section_mut(Some("app:main")).ok_or_else(|| {
    anyhow::anyhow!("missing [app:main] section in /etc/ckan/default/ckan.ini")
})?;
let mut ckan_plugins = app_main_section
    .get("ckan.plugins")
    .ok_or_else(|| {
        anyhow::anyhow!("missing ckan.plugins key in [app:main] of /etc/ckan/default/ckan.ini")
    })?
    .to_string();
// `ckan.plugins` is a whitespace-separated list; only append the plugin when it is not
// already listed so that re-running the installer does not duplicate the entry.
if !ckan_plugins
    .split_whitespace()
    .any(|plugin| plugin == "datapusher_plus")
{
    ckan_plugins.push_str(" datapusher_plus");
}
app_main_section.insert("ckan.plugins", ckan_plugins);
app_main_section.insert(
    "scheming.dataset_schemas",
    "ckanext.datapusher_plus:dataset-druf.yaml",
);
conf.write_to_file("/etc/ckan/default/ckan.ini")?;
// Default DataPusher+ settings in ini `key = value` form. The text is written to
// `dpp_default_config.ini` below and merged into ckan.ini with `ckan config-tool -f`.
// The `api_token` placeholder is replaced further down with a freshly generated token;
// the `describeGPT_api_key` placeholder is not replaced anywhere in the visible code.
// NOTE(review): `/tmp/jinja2_butecode_cache` looks like a misspelling of "bytecode"; it is
// only a directory path, so confirm against the DataPusher+ defaults before changing it.
let dpp_default_config = r#"
ckanext.datapusher_plus.use_proxy = false
ckanext.datapusher_plus.download_proxy =
ckanext.datapusher_plus.ssl_verify = false
# supports INFO, DEBUG, TRACE - use DEBUG or TRACE when debugging scheming Formulas
ckanext.datapusher_plus.upload_log_level = INFO
ckanext.datapusher_plus.formats = csv tsv tab ssv xls xlsx xlsxb xlsm ods geojson shp qgis zip
ckanext.datapusher_plus.pii_screening = false
ckanext.datapusher_plus.pii_found_abort = false
ckanext.datapusher_plus.pii_regex_resource_id_or_alias =
ckanext.datapusher_plus.pii_show_candidates = false
ckanext.datapusher_plus.pii_quick_screen = false
ckanext.datapusher_plus.qsv_bin = /usr/local/bin/qsvdp
ckanext.datapusher_plus.preview_rows = 100
ckanext.datapusher_plus.download_timeout = 300
ckanext.datapusher_plus.max_content_length = 1256000000000
ckanext.datapusher_plus.chunk_size = 16384
ckanext.datapusher_plus.default_excel_sheet = 0
ckanext.datapusher_plus.sort_and_dupe_check = true
ckanext.datapusher_plus.dedup = false
ckanext.datapusher_plus.unsafe_prefix = unsafe_
ckanext.datapusher_plus.reserved_colnames = _id
ckanext.datapusher_plus.prefer_dmy = false
ckanext.datapusher_plus.ignore_file_hash = true
ckanext.datapusher_plus.auto_index_threshold = 3
ckanext.datapusher_plus.auto_index_dates = true
ckanext.datapusher_plus.auto_unique_index = true
ckanext.datapusher_plus.summary_stats_options =
ckanext.datapusher_plus.add_summary_stats_resource = false
ckanext.datapusher_plus.summary_stats_with_preview = false
ckanext.datapusher_plus.qsv_stats_string_max_length = 32767
ckanext.datapusher_plus.qsv_dates_whitelist = date,time,due,open,close,created
ckanext.datapusher_plus.qsv_freq_limit = 10
ckanext.datapusher_plus.auto_alias = true
ckanext.datapusher_plus.auto_alias_unique = false
ckanext.datapusher_plus.copy_readbuffer_size = 1048576
ckanext.datapusher_plus.type_mapping = {"String": "text", "Integer": "numeric","Float": "numeric","DateTime": "timestamp","Date": "date","NULL": "text"}
ckanext.datapusher_plus.auto_spatial_simplication = true
ckanext.datapusher_plus.spatial_simplication_relative_tolerance = 0.1
ckanext.datapusher_plus.latitude_fields = latitude,lat
ckanext.datapusher_plus.longitude_fields = longitude,long,lon
ckanext.datapusher_plus.jinja2_bytecode_cache_dir = /tmp/jinja2_butecode_cache
ckanext.datapusher_plus.auto_unzip_one_file = true
ckanext.datapusher_plus.api_token = <CKAN service account token for CKAN user with sysadmin privileges>
ckanext.datapusher_plus.describeGPT_api_key = <Token for OpenAI API compatible service>
ckanext.datapusher_plus.file_bin = /usr/bin/file
ckanext.datapusher_plus.enable_druf = true
ckanext.datapusher_plus.enable_form_redirect = true
"#;
std::fs::write("dpp_default_config.ini", dpp_default_config)?;
cmd!(
sh,
"ckan config-tool /etc/ckan/default/ckan.ini -f dpp_default_config.ini"
)
.run()?;
// Register the "TAB" resource format in CKAN's resource_formats.json.
let resource_formats_path = "/usr/lib/ckan/default/src/ckan/config/resource_formats.json";
let resource_formats_str = std::fs::read_to_string(resource_formats_path)?;
let mut resource_formats_val: serde_json::Value =
    serde_json::from_str(&resource_formats_str)?;
// An unexpected document layout is reported as an error instead of a panic.
// NOTE(review): the new entry is appended to the array stored at index 0 of the document,
// i.e. this assumes the first element is itself the list of format entries. Confirm against
// the installed resource_formats.json; if the file is a flat list of entries, the push
// should target the top-level array instead.
let all_resource_formats = resource_formats_val
    .get_mut(0)
    .and_then(serde_json::Value::as_array_mut)
    .ok_or_else(|| {
        anyhow::anyhow!(
            "unexpected layout in {resource_formats_path}: first element is not a JSON array"
        )
    })?;
// Skip the push when a "TAB" entry is already present so that re-running the installer
// does not append the same entry again.
let tab_already_registered = all_resource_formats
    .iter()
    .any(|entry| entry.get(0).and_then(serde_json::Value::as_str) == Some("TAB"));
if !tab_already_registered {
    all_resource_formats.push(json!([
        "TAB",
        "Tab Separated Values File",
        "text/tab-separated-values",
        []
    ]));
}
std::fs::write(
    resource_formats_path,
    serde_json::to_string(&resource_formats_val)?,
)?;
// Generate the en_US.UTF-8 locale and refresh the locale settings before invoking the
// CKAN CLI below.
cmd!(sh, "sudo locale-gen en_US.UTF-8").run()?;
cmd!(sh, "sudo update-locale").run()?;
// Create an API token named "dpplus" for the user and capture the command's stdout.
let token_command_output = cmd!(
    sh,
    "ckan -c /etc/ckan/default/ckan.ini user token add {username} dpplus"
)
.read()?;
// The token is taken from the last line of the output with tab characters removed
// (equivalent to piping through `tail -n 1 | tr -d '\t'`, without spawning extra
// processes). An empty result is an error: writing an empty api_token into ckan.ini
// would only fail later and less obviously.
let dpp_api_token = token_command_output
    .lines()
    .next_back()
    .map(|last_line| last_line.replace('\t', ""))
    .filter(|token| !token.is_empty())
    .ok_or_else(|| {
        anyhow::anyhow!("`ckan user token add` did not print an API token for {username}")
    })?;
// Store the token in ckan.ini, replacing the placeholder value.
cmd!(sh, "ckan config-tool /etc/ckan/default/ckan.ini ckanext.datapusher_plus.api_token={dpp_api_token}").run()?;
// Run the database migrations provided by the datapusher_plus plugin.
cmd!(
    sh,
    "ckan -c /etc/ckan/default/ckan.ini db upgrade -p datapusher_plus"
)
.run()?;
println!(
"{}",
"✅ 8. Installed ckanext-scheming and DataPusher+ extensions."
.if_supports_color(Stdout, |text| text.on_green().white())
);
println!(
"\n{}",
"✅ 9. Running CKAN instance..."
.if_supports_color(Stdout, |text| text.on_green().white())
);
cmd!(sh, "ckan -c /etc/ckan/default/ckan.ini run").run()?; cmd!(sh, "ckan -c /etc/ckan/default/ckan.ini run").run()?;
} }