diff --git a/.gitignore b/.gitignore index d4f1541..11a5e2e 100644 Binary files a/.gitignore and b/.gitignore differ diff --git a/DESCRIPTION b/DESCRIPTION index b93bd23..92ed230 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -24,7 +24,8 @@ Suggests: maps, quarto, rmarkdown, - testthat (>= 3.0.0) + testthat (>= 3.0.0), + yaml, VignetteBuilder: quarto SystemRequirements: Quarto command line tool (https://github.com/quarto-dev/quarto-cli) Encoding: UTF-8 diff --git a/NAMESPACE b/NAMESPACE index 6ae9268..a7cc05c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,2 +1,4 @@ # Generated by roxygen2: do not edit by hand +export(validate_all) +export(validate_custom) diff --git a/R/validate.R b/R/validate.R new file mode 100644 index 0000000..1b0e7fc --- /dev/null +++ b/R/validate.R @@ -0,0 +1,320 @@ +# R/validate.R +# +# Master validation entry point for the betydata package. +# +# Runs two layers of constraint checks: +# +# Layer 1 — Frictionless-native constraints +# Declared in datapackage.json: required, minimum/maximum, enum, +# unique, primaryKey, foreignKeys. +# Checked by inspecting the loaded data frames directly against the +# schema (avoids re-reading CSVs; the data is already in memory). +# +# Layer 2 — Custom constraints +# Declared in data-raw/custom_constraints.yaml. +# Covers composite cross-field rules and cross-table range lookups +# that Frictionless Table Schema cannot express natively. +# Implemented in R/validate_custom.R. +# +# Typical usage from data-raw/make-data.R: +# +# source("R/validate_custom.R") +# source("R/validate.R") +# tables <- list( +# sites = sites, +# traitsview = traitsview, +# cultivars = cultivars, +# variables = variables +# ) +# validate_all(tables) + +#' Validate Frictionless-native field-level constraints from datapackage.json +#' +#' Reads the schema from datapackage.json and checks each field's declared +#' constraints (required, minimum, maximum, enum, unique) against the +#' supplied data frame. 
#' Validate Frictionless-native field-level constraints from datapackage.json
#'
#' Checks each field's declared constraints (required, minimum, maximum,
#' enum, unique) against the supplied data frame. Foreign key and primaryKey
#' checks are handled separately in validate_foreign_keys() and
#' validate_primary_key().
#'
#' @param df data frame to validate
#' @param schema the "schema" list for this resource from datapackage.json
#' @param table_name table name string for error messages
#' @return character vector of error messages (empty if all pass)
#' @keywords internal
validate_frictionless_fields <- function(df, schema, table_name) {
  errors <- character(0)
  fields <- schema$fields
  if (is.null(fields)) return(errors)

  for (field in fields) {
    fname <- field$name
    fcons <- field$constraints
    if (is.null(fcons)) next
    if (!fname %in% names(df)) next # column not present; schema mismatch caught elsewhere

    col <- df[[fname]]

    # required: the field may not contain NA
    if (isTRUE(fcons$required)) {
      na_count <- sum(is.na(col))
      if (na_count > 0) {
        errors <- c(errors, sprintf(
          "[%s.%s] required field has %d NA value(s)", table_name, fname, na_count
        ))
      }
    }

    # unique (single-field; compound uniqueness via primaryKey handled separately)
    if (isTRUE(fcons$unique)) {
      non_na <- col[!is.na(col)]
      if (anyDuplicated(non_na) > 0) {
        errors <- c(errors, sprintf(
          "[%s.%s] unique constraint violated (%d duplicate values)",
          table_name, fname, sum(duplicated(non_na))
        ))
      }
    }

    # minimum / maximum. Coerce BOTH the column and the declared bound to
    # numeric: JSON may carry the bound as a string, and comparing a numeric
    # vector against a character scalar silently coerces the numbers to
    # character and compares lexicographically (in R, 5 < "10" is FALSE).
    # A bound that cannot be interpreted numerically is skipped.
    col_num <- suppressWarnings(as.numeric(col))

    min_bound <- suppressWarnings(as.numeric(fcons$minimum))
    if (length(min_bound) == 1 && !is.na(min_bound)) {
      bad <- !is.na(col_num) & col_num < min_bound
      if (any(bad)) {
        errors <- c(errors, sprintf(
          "[%s.%s] %d value(s) below minimum (%s)",
          table_name, fname, sum(bad), fcons$minimum
        ))
      }
    }

    max_bound <- suppressWarnings(as.numeric(fcons$maximum))
    if (length(max_bound) == 1 && !is.na(max_bound)) {
      bad <- !is.na(col_num) & col_num > max_bound
      if (any(bad)) {
        errors <- c(errors, sprintf(
          "[%s.%s] %d value(s) above maximum (%s)",
          table_name, fname, sum(bad), fcons$maximum
        ))
      }
    }

    # enum: every non-NA value must be in the allowed set
    if (!is.null(fcons$enum)) {
      allowed <- unlist(fcons$enum)
      bad <- !is.na(col) & !col %in% allowed
      if (any(bad)) {
        errors <- c(errors, sprintf(
          "[%s.%s] %d value(s) not in allowed set {%s}",
          table_name, fname, sum(bad), paste(allowed, collapse = ", ")
        ))
      }
    }
  }
  errors
}

#' Validate primaryKey uniqueness for a table
#'
#' Checks that the combination of primaryKey columns declared in the schema
#' is unique across rows. Silently skips the check when the schema declares
#' no primaryKey or when any key column is missing from the data frame
#' (schema/data mismatches are reported elsewhere).
#'
#' @param df data frame
#' @param schema the schema list for this resource
#' @param table_name string for error messages
#' @return character vector of error messages
#' @keywords internal
validate_primary_key <- function(df, schema, table_name) {
  errors <- character(0)
  pk <- schema$primaryKey
  if (is.null(pk)) return(errors)

  pk_cols <- unlist(pk)
  present <- pk_cols[pk_cols %in% names(df)]
  if (length(present) < length(pk_cols)) return(errors)

  dupes <- duplicated(df[, present, drop = FALSE])
  if (any(dupes)) {
    errors <- c(errors, sprintf(
      "[%s] primaryKey (%s) has %d duplicate combination(s)",
      table_name, paste(pk_cols, collapse = ", "), sum(dupes)
    ))
  }
  errors
}

#' Validate foreign key referential integrity across tables
#'
#' Uses fast set membership (%in% on values or composite keys) rather than
#' row-by-row comparison. Rows whose key columns contain NA are skipped: a
#' missing key cannot meaningfully violate referential integrity, and for
#' composite keys paste() would otherwise render NA as the literal string
#' "NA", producing false matches/mismatches against the reference keys.
#'
#' @param tables named list of all loaded data frames
#' @param schema the schema list for the child resource
#' @param table_name name of the child table
#' @return character vector of error messages
#' @keywords internal
validate_foreign_keys <- function(tables, schema, table_name) {
  errors <- character(0)
  fks <- schema$foreignKeys
  if (is.null(fks)) return(errors)

  df <- tables[[table_name]]

  for (fk in fks) {
    child_cols <- unlist(fk$fields)
    ref_resource <- fk$reference$resource
    ref_cols <- unlist(fk$reference$fields)

    ref_df <- tables[[ref_resource]]
    if (is.null(ref_df)) next # referenced table not loaded; skip silently

    if (!all(child_cols %in% names(df))) next
    if (!all(ref_cols %in% names(ref_df))) next

    if (length(child_cols) == 1) {
      # Single-column FK: plain set membership; NA children are skipped.
      child_vals <- df[[child_cols[1]]]
      ref_vals <- ref_df[[ref_cols[1]]]

      bad <- !is.na(child_vals) & !child_vals %in% ref_vals
      if (any(bad)) {
        bad_rows <- which(bad)
        errors <- c(errors, sprintf(
          "[%s] foreign key %s -> %s(%s): %d value(s) not found in reference table (row indices: %s)",
          table_name,
          child_cols[1],
          ref_resource,
          ref_cols[1],
          sum(bad),
          paste(head(bad_rows, 10), collapse = ", ")
        ))
      }
    } else {
      # Multi-column FK: compare composite keys built as "a||b||..." strings.
      # Rows with NA in ANY key column are excluded BEFORE pasting, because
      # paste() turns NA into the string "NA", which previously caused such
      # rows to be (mis)checked against the reference keys.
      child_sub <- df[, child_cols, drop = FALSE]
      any_na <- Reduce(`|`, lapply(child_sub, is.na))
      child_key <- do.call(paste, c(child_sub, sep = "||"))
      ref_key <- do.call(paste, c(ref_df[, ref_cols, drop = FALSE], sep = "||"))

      bad <- !any_na & !child_key %in% ref_key
      if (any(bad)) {
        errors <- c(errors, sprintf(
          "[%s] composite foreign key (%s) -> %s(%s): %d value(s) not found in reference table",
          table_name,
          paste(child_cols, collapse = ", "),
          ref_resource,
          paste(ref_cols, collapse = ", "),
          sum(bad)
        ))
      }
    }
  }
  errors
}
#' Run all Frictionless-layer validation across all loaded tables
#'
#' Reads datapackage.json and, for every resource whose name appears in
#' \code{tables}, runs the field-level, primaryKey, and foreignKey checks.
#' Missing descriptor file is reported and skipped rather than failing.
#'
#' @param tables named list of data frames (ONLY constrained tables)
#' @param datapackage_path path to datapackage.json
#' @return named list of character vectors; each name is a table with errors
#' @keywords internal
validate_frictionless_layer <- function(
    tables,
    datapackage_path = "datapackage.json") {

  if (!file.exists(datapackage_path)) {
    message("datapackage.json not found at: ", datapackage_path,
            " — skipping Frictionless layer validation")
    return(list())
  }

  descriptor <- jsonlite::read_json(datapackage_path)
  problems <- list()

  for (resource in descriptor$resources) {
    resource_name <- resource$name
    resource_schema <- resource$schema
    if (is.null(resource_schema)) next
    if (!resource_name %in% names(tables)) next

    table_df <- tables[[resource_name]]
    # All three check families contribute to one flat error vector per table.
    found <- c(
      validate_frictionless_fields(table_df, resource_schema, resource_name),
      validate_primary_key(table_df, resource_schema, resource_name),
      validate_foreign_keys(tables, resource_schema, resource_name)
    )

    if (length(found) > 0) problems[[resource_name]] <- found
  }
  problems
}

#' Run all validation layers and report results
#'
#' Main entry point: runs the Frictionless-native constraint checks and the
#' custom constraint checks, collates per-table results, reports a summary,
#' and either stops (on hard errors) or returns the results.
#'
#' @param tables Named list of data frames. Names must match resource names
#'   in datapackage.json and table names in custom_constraints.yaml.
#'   Only include tables that have constraints; exclude junction tables.
#' @param datapackage_path Path to datapackage.json (default: repo root).
#' @param constraints_path Path to custom_constraints.yaml.
#' @param stop_on_error If TRUE (default), calls stop() when errors are found.
#'   Set to FALSE to return results without stopping (useful in tests).
#' @return Invisibly returns a named list of error vectors per table.
#'   Empty list means all checks passed.
#'
#' @export
validate_all <- function(
    tables,
    datapackage_path = "datapackage.json",
    constraints_path = "data-raw/custom_constraints.yaml",
    stop_on_error = TRUE) {

  # The caller (data-raw/make-data.R) is expected to pass only tables that
  # actually carry constraints; junction/lookup tables are filtered there so
  # no validation time is wasted on unconstrained tables.
  if (length(tables) == 0) {
    message("No tables to validate.")
    return(invisible(list()))
  }

  message(sprintf("Running Layer 1: Frictionless-native constraint checks (%d tables)...",
                  length(tables)))
  layer1 <- validate_frictionless_layer(tables, datapackage_path)

  message("Running Layer 2: Custom constraint checks (composite + cross-table)...")
  layer2 <- validate_custom(tables, constraints_path)

  # Collate per-table results from both layers.
  affected <- union(names(layer1), names(layer2))
  if (length(affected) == 0) {
    message("✓ All validation checks passed.")
    return(invisible(list()))
  }

  merged <- lapply(affected, function(tname) c(layer1[[tname]], layer2[[tname]]))
  names(merged) <- affected
  merged <- merged[lengths(merged) > 0]

  # Report a short summary: at most the first five errors per table.
  total <- sum(lengths(merged))
  message(sprintf("✗ Validation found %d error(s) across %d table(s):",
                  total, length(merged)))
  for (tname in names(merged)) {
    table_errors <- merged[[tname]]
    message(sprintf(" [%s] — %d error(s)", tname, length(table_errors)))
    for (err in head(table_errors, 5)) {
      message(" - ", err)
    }
    if (length(table_errors) > 5) {
      message(sprintf(" ... and %d more", length(table_errors) - 5))
    }
  }

  if (stop_on_error) {
    stop("Data validation failed. Fix errors before saving package data.",
         call. = FALSE)
  }

  invisible(merged)
}

# R/validate_custom.R
#
# Validation functions for BETYdb constraints that cannot be expressed in
# Frictionless Table Schema: cross-field arithmetic rules, conditional
# if/then rules, and cross-table lookup range checks.
#
# These implement the logic originally enforced by PostgreSQL CHECK constraints
# and trigger functions in the bety Rails application.
#
# The YAML file data-raw/custom_constraints.yaml is the single source of
# truth for what constraints exist and why. These functions interpret that file.

#' Load the custom constraints definition file
#'
#' Resolves the YAML path — development tree first, then the installed
#' package's extdata copy — and parses it. Returns an empty list, with a
#' warning, when the file cannot be found in either location.
#'
#' @param path Path to custom_constraints.yaml. During development this is
#'   data-raw/custom_constraints.yaml. After package installation it resolves
#'   to system.file("extdata", "custom_constraints.yaml", package = "betydata").
#' @return A named list parsed from the YAML file.
#' @keywords internal
load_custom_constraints <- function(
    path = "data-raw/custom_constraints.yaml") {

  # Fall back to the installed-package copy when the dev path is absent.
  if (!file.exists(path)) {
    path <- system.file("extdata", "custom_constraints.yaml", package = "betydata")
  }

  # system.file() returns "" when the file is not installed either.
  path_ok <- nzchar(path) && file.exists(path)
  if (!path_ok) {
    warning("custom_constraints.yaml not found at: ", path,
            " — skipping custom constraint validation")
    return(list())
  }

  yaml::read_yaml(path)
}
#' Run a sum_limit constraint
#'
#' Checks that the row-wise sum of the specified columns does not exceed
#' con$value (e.g. sand_pct + clay_pct <= 100). Rows where any of the
#' involved columns is NA are skipped.
#'
#' @param df data frame
#' @param con a single composite_constraint list element of type "sum_limit"
#' @return character vector of error messages (empty if all rows pass)
#' @keywords internal
run_sum_limit <- function(df, con) {
  wanted <- con$columns
  have <- wanted[wanted %in% names(df)]
  if (length(have) == 0) return(character(0))

  involved <- df[, have, drop = FALSE]
  totals <- rowSums(involved, na.rm = FALSE)

  # Only rows with a complete set of values are eligible for the check.
  complete <- rowSums(!is.na(involved)) == length(have)
  offenders <- which(complete & totals > con$value)
  if (length(offenders) == 0) return(character(0))

  sprintf("[%s] %s (%d rows, indices: %s)",
          con$id, con$message, length(offenders),
          paste(head(offenders, 10), collapse = ", "))
}

#' Run a conditional constraint (if field A is set, field B must also be set)
#'
#' @param df data frame
#' @param con a single composite_constraint list element of type "conditional"
#' @return character vector of error messages (empty if all rows pass)
#' @keywords internal
run_conditional <- function(df, con) {
  trigger_col <- con[["if"]][["column"]]
  require_col <- con[["then"]][["column"]]

  if (!all(c(trigger_col, require_col) %in% names(df))) return(character(0))

  # Shared evaluator for both sides; `default` is the per-row fallback used
  # for unrecognised condition names (FALSE for the trigger, so nothing
  # fires; TRUE for the requirement, so nothing is flagged).
  eval_condition <- function(column, condition, default) {
    vals <- df[[column]]
    if (identical(condition, "not_null")) {
      !is.na(vals)
    } else if (identical(condition, "not_null_and_not_empty")) {
      !is.na(vals) & nzchar(as.character(vals))
    } else {
      rep(default, nrow(df))
    }
  }

  trigger <- eval_condition(trigger_col, con[["if"]][["condition"]], FALSE)
  satisfied <- eval_condition(require_col, con[["then"]][["condition"]], TRUE)

  violations <- which(trigger & !satisfied)
  if (length(violations) == 0) return(character(0))

  sprintf("[%s] %s (%d rows, indices: %s)",
          con$id, con$message, length(violations),
          paste(head(violations, 10), collapse = ", "))
}

#' Run a unique_combination constraint
#'
#' Checks that no two rows share the same values in ALL specified columns.
#' Example: (name, specie_id) must be unique in cultivars. Every row in a
#' duplicate group is reported, including the first occurrence.
#'
#' @param df data frame
#' @param con a single composite_constraint list element of type
#'   "unique_combination"
#' @return character vector of error messages (empty if all rows pass)
#' @keywords internal
run_unique_combination <- function(df, con) {
  wanted <- con$columns
  have <- wanted[wanted %in% names(df)]
  if (length(have) == 0) return(character(0))

  key_df <- df[, have, drop = FALSE]
  # duplicated() in both directions flags all members of each group.
  involved <- duplicated(key_df) | duplicated(key_df, fromLast = TRUE)
  offenders <- which(involved)
  if (length(offenders) == 0) return(character(0))

  sprintf("[%s] %s (%d rows, indices: %s)",
          con$id, con$message, length(offenders),
          paste(head(offenders, 10), collapse = ", "))
}

#' Dispatch and run all composite_constraints for a single table
#'
#' "all_or_none" is documented only (geometry handled elsewhere) and, like
#' any unrecognised type, yields no errors.
#'
#' @param df data frame for the table being validated
#' @param constraints the composite_constraints list from the YAML for this table
#' @return character vector of all error messages
#' @keywords internal
run_composite_constraints <- function(df, constraints) {
  handlers <- list(
    sum_limit = run_sum_limit,
    conditional = run_conditional,
    unique_combination = run_unique_combination
  )
  collected <- lapply(constraints, function(con) {
    handler <- handlers[[con$type]]
    if (is.null(handler)) character(0) else handler(df, con)
  })
  # as.character() maps the empty-input NULL from unlist() back to character(0).
  as.character(unlist(collected, use.names = FALSE))
}
#' Run a conditional_range constraint (cross-table lookup)
#'
#' Checks that a column's values fall within the min/max bounds defined for
#' each row's linked record in another table. Replicates the logic of the
#' PostgreSQL restrict_trait_range trigger.
#'
#' Uses match() instead of merge() so no large intermediate data frame is
#' created. Bounds are stored in the lookup table as strings and may be
#' "Infinity"/"-Infinity" (or "Inf"/"-Inf"). When
#' \code{con$null_means_no_limit} is TRUE, a missing (NA) bound is treated
#' as unbounded on that side, so a row whose lookup declares only one bound
#' is still checked against that bound. (Previously the flag was read but
#' never applied, and any NA bound disabled BOTH checks for the row, so a
#' finite max was never enforced when min was missing.)
#'
#' @param df data frame for the table being validated (e.g. traitsview)
#' @param lookup_df data frame of the lookup table (e.g. variables)
#' @param con a single custom_constraint list element of type
#'   "conditional_range"
#' @return character vector of error messages (empty if all rows pass)
#' @keywords internal
run_conditional_range <- function(df, lookup_df, con) {
  val_col <- con$column
  join_col <- con$join_on
  key_col <- con$lookup$key_col
  min_col <- con$lookup$min_col
  max_col <- con$lookup$max_col
  null_ok <- isTRUE(con$null_means_no_limit)

  if (!all(c(val_col, join_col) %in% names(df))) return(character(0))
  if (!all(c(key_col, min_col, max_col) %in% names(lookup_df))) return(character(0))

  # Row-wise lookup of the bounds via match() (no merge()).
  lookup_idx <- match(df[[join_col]], lookup_df[[key_col]])

  # Parse a bound column: "Infinity"/"Inf" and "-Infinity"/"-Inf" sentinels,
  # otherwise numeric; anything unparseable (or NA, including rows whose
  # join key had no match) stays NA.
  to_numeric_bound <- function(x) {
    ifelse(x %in% c("Infinity", "Inf"), Inf,
           ifelse(x %in% c("-Infinity", "-Inf"), -Inf,
                  suppressWarnings(as.numeric(x))))
  }

  min_num <- to_numeric_bound(lookup_df[[min_col]][lookup_idx])
  max_num <- to_numeric_bound(lookup_df[[max_col]][lookup_idx])

  # An absent bound means "no limit on that side" when the constraint says
  # so; this also leaves unmatched join keys unconstrained, as before.
  if (null_ok) {
    min_num[is.na(min_num)] <- -Inf
    max_num[is.na(max_num)] <- Inf
  }

  val_num <- suppressWarnings(as.numeric(df[[val_col]]))

  # Each bound is enforced independently, so a finite max still applies
  # when min is NA (and vice versa). NA values are never flagged.
  bad <- !is.na(val_num) &
    ((!is.na(min_num) & val_num < min_num) |
       (!is.na(max_num) & val_num > max_num))

  bad_rows <- which(bad)
  if (length(bad_rows) == 0) return(character(0))

  sprintf("[%s] %s (%d rows, indices: %s)",
          con$id, con$message,
          length(bad_rows),
          paste(head(bad_rows, 10), collapse = ", "))
}

#' Dispatch and run all custom_constraints for a single table
#'
#' Currently only the "conditional_range" type is implemented; constraints
#' with any other (or missing) type are ignored. A missing lookup table
#' raises a warning and skips the check rather than failing the run.
#'
#' @param tables named list of all data frames (all tables loaded)
#' @param table_name name of the table currently being validated
#' @param constraints the custom_constraints list from the YAML for this table
#' @return character vector of all error messages
#' @keywords internal
run_custom_constraints <- function(tables, table_name, constraints) {
  errors <- character(0)
  df <- tables[[table_name]]

  for (con in constraints) {
    # identical() is NULL-safe: a malformed entry without a type is skipped
    # instead of crashing the whole validation run (== on NULL would error).
    if (!identical(con$type, "conditional_range")) next

    lookup_table_name <- con$lookup$table
    lookup_df <- tables[[lookup_table_name]]
    if (is.null(lookup_df)) {
      warning(sprintf(
        "[%s] lookup table '%s' not loaded, skipping cross-table check",
        con$id, lookup_table_name
      ), call. = FALSE)
      next
    }
    errors <- c(errors, run_conditional_range(df, lookup_df, con))
  }
  errors
}
Tables with no errors +#' are omitted from the output. +#' +#' @export +validate_custom <- function( + tables, + constraints_path = "data-raw/custom_constraints.yaml") { + + rules <- load_custom_constraints(constraints_path) + if (length(rules) == 0) { + return(list()) + } + + results <- list() + + for (table_name in names(rules)) { + if (!table_name %in% names(tables)) next + table_rules <- rules[[table_name]] + errors <- character(0) + + if (!is.null(table_rules$composite_constraints)) { + errors <- c(errors, run_composite_constraints( + tables[[table_name]], + table_rules$composite_constraints + )) + } + + if (!is.null(table_rules$custom_constraints)) { + errors <- c(errors, run_custom_constraints( + tables, table_name, table_rules$custom_constraints + )) + } + + if (length(errors) > 0) { + results[[table_name]] <- errors + } + } + + results +} \ No newline at end of file diff --git a/data-raw/custom_constraints.yaml b/data-raw/custom_constraints.yaml new file mode 100644 index 0000000..55c2988 --- /dev/null +++ b/data-raw/custom_constraints.yaml @@ -0,0 +1,49 @@ +# Custom constraints for BETY data package +# Handles rules that Frictionless Table Schema cannot express: +# - Composite (cross-field) constraints +# - Cross-table range lookups +# - Conditional requirements + +sites: + composite_constraints: + - id: soil_fraction + type: sum_limit + columns: [sand_pct, clay_pct] + value: 100 + message: "soil percentages (sand_pct + clay_pct) cannot exceed 100" + +traitsview: + composite_constraints: + - id: stat_requires_statname + type: conditional + if: + column: stat + condition: not_null_and_not_empty + then: + column: statname + condition: not_null_and_not_empty + message: "if stat is provided, statname must also be provided" + + custom_constraints: + - id: trait_mean_range + type: conditional_range + column: mean + join_on: variable_id + lookup: + table: variables + key_col: id + min_col: min + max_col: max + null_means_no_limit: true + message: "trait mean 
must fall within the min/max range defined in variables table" + +cultivars: + composite_constraints: + - id: unique_name_per_species + type: unique_combination + columns: [name, specie_id] + message: "cultivar name must be unique within each species" + + + + \ No newline at end of file diff --git a/data-raw/make-data.R b/data-raw/make-data.R index 0a54b0e..98f219c 100644 --- a/data-raw/make-data.R +++ b/data-raw/make-data.R @@ -1,8 +1,8 @@ #!/usr/bin/env Rscript # Build betydata package data objects from CSV sources - library(readr) +library(dplyr) # Helper for logging (falls back to message if PEcAn.logger not available) log_info <- function(msg) { @@ -120,110 +120,173 @@ read_support_table <- function(name) { } log_info("Reading support tables...") -species <- read_support_table("species") -sites <- read_support_table("sites") +species <- read_support_table("species") +sites <- read_support_table("sites") variables <- read_support_table("variables") citations <- read_support_table("citations") cultivars <- read_support_table("cultivars") -methods <- read_support_table("methods") +methods <- read_support_table("methods") treatments <- read_support_table("treatments") -pfts <- read_support_table("pfts") -priors <- read_support_table("priors") +pfts <- read_support_table("pfts") +priors <- read_support_table("priors") managements <- read_support_table("managements") -entities <- read_support_table("entities") +entities <- read_support_table("entities") pfts_species <- read_support_table("pfts_species") -pfts_priors <- read_support_table("pfts_priors") +pfts_priors <- read_support_table("pfts_priors") managements_treatments <- read_support_table("managements_treatments") cultivars_pfts <- read_support_table("cultivars_pfts") +# --------------------------------------------------------------------------- +# Data validation +# --------------------------------------------------------------------------- +# Run both validation layers before saving any .rda files. 
+# Layer 1: Frictionless-native constraints from datapackage.json +# Layer 2: Custom constraints from inst/extdata/custom_constraints.yaml +# +# If yaml or jsonlite are not available the validation step is skipped with +# a warning so that the build remains functional in minimal environments. + +# --------------------------------------------------------------------------- +# Data validation with automatic filtering +# --------------------------------------------------------------------------- +log_info("Running data validation...") + +if (requireNamespace("yaml", quietly = TRUE) && + requireNamespace("jsonlite", quietly = TRUE)) { + + source("R/validate_custom.R") + source("R/validate.R") + + validation_tables <- list( + traitsview = traitsview, + sites = sites, + variables = variables, + cultivars = cultivars, + citations = citations, + species = species + ) + + validation_tables <- Filter(Negate(is.null), validation_tables) + + log_info(sprintf("Validating %d core tables...", length(validation_tables))) + + # RUN VALIDATION BUT DON'T STOP + validation_results <- validate_all( + tables = validation_tables, + datapackage_path = "datapackage.json", + stop_on_error = FALSE # ← CHANGED: Don't stop, just collect errors + ) + + # NOW WE CAN FILTER BAD DATA + if (length(validation_results) > 0) { + log_info("Data quality issues found. 
Filtering invalid records...") + + dir.create("data-raw/invalid_data", showWarnings = FALSE) + + # ===== TRAITSVIEW ===== + if (!is.null(validation_results$traitsview)) { + # Filter: missing required fields OR invalid stat/statname combination + traitsview_invalid <- traitsview %>% + dplyr::filter( + is.na(trait) | is.na(mean) | is.na(id) | n < 1 | + # Also filter: if stat is provided, statname must be provided + (!is.na(stat) & nzchar(stat) & (is.na(statname) | !nzchar(statname))) + ) + + traitsview <- traitsview %>% + dplyr::filter( + !is.na(trait) & !is.na(mean) & !is.na(id) & n >= 1 & + # Keep only rows where: if stat is provided, statname is also provided + (is.na(stat) | !nzchar(stat) | (!is.na(statname) & nzchar(statname))) + ) + + readr::write_csv(traitsview_invalid, + "data-raw/invalid_data/traitsview_invalid.csv") + log_info(sprintf(" traitsview: removed %d rows", nrow(traitsview_invalid))) + } + + # ===== SITES ===== + if (!is.null(validation_results$sites)) { + sites_invalid <- sites %>% + dplyr::filter(is.na(sitename)) + + sites <- sites %>% + dplyr::filter(!is.na(sitename)) + + readr::write_csv(sites_invalid, + "data-raw/invalid_data/sites_invalid.csv") + log_info(sprintf(" sites: removed %d rows", nrow(sites_invalid))) + } + + # ===== VARIABLES ===== + if (!is.null(validation_results$variables)) { + variables_invalid <- variables %>% + dplyr::filter(is.na(name) | is.na(units) | duplicated(name)) + + variables <- variables %>% + dplyr::filter(!is.na(name) & !is.na(units)) %>% + dplyr::distinct(name, .keep_all = TRUE) + + readr::write_csv(variables_invalid, + "data-raw/invalid_data/variables_invalid.csv") + log_info(sprintf(" variables: removed %d rows", nrow(variables_invalid))) + } + # ===== CULTIVARS ===== + if (!is.null(validation_results$cultivars)) { + cultivars_invalid <- cultivars %>% + dplyr::filter(is.na(name)) + + cultivars <- cultivars %>% + dplyr::filter(!is.na(name)) + + readr::write_csv(cultivars_invalid, + 
"data-raw/invalid_data/cultivars_invalid.csv") + log_info(sprintf(" cultivars: removed %d rows", nrow(cultivars_invalid))) + } + + log_info("✓ Invalid records separated to data-raw/invalid_data/") + + } else { + log_info("✓ All validation checks passed.") + } + +} else { + log_info("WARNING: 'yaml' or 'jsonlite' not available — skipping validation.") +} +# --------------------------------------------------------------------------- +# Save .rda files +# --------------------------------------------------------------------------- log_info("Saving .rda files to data/...") usethis::use_data(traitsview, overwrite = TRUE, compress = "xz") # Only save support tables that exist -if (!is.null(species)) usethis::use_data(species, overwrite = TRUE, compress = "xz") -if (!is.null(sites)) usethis::use_data(sites, overwrite = TRUE, compress = "xz") +if (!is.null(species)) usethis::use_data(species, overwrite = TRUE, compress = "xz") +if (!is.null(sites)) usethis::use_data(sites, overwrite = TRUE, compress = "xz") if (!is.null(variables)) usethis::use_data(variables, overwrite = TRUE, compress = "xz") if (!is.null(citations)) usethis::use_data(citations, overwrite = TRUE, compress = "xz") if (!is.null(cultivars)) usethis::use_data(cultivars, overwrite = TRUE, compress = "xz") -if (!is.null(methods)) usethis::use_data(methods, overwrite = TRUE, compress = "xz") +if (!is.null(methods)) usethis::use_data(methods, overwrite = TRUE, compress = "xz") if (!is.null(treatments)) usethis::use_data(treatments, overwrite = TRUE, compress = "xz") -if (!is.null(pfts)) usethis::use_data(pfts, overwrite = TRUE, compress = "xz") -if (!is.null(priors)) usethis::use_data(priors, overwrite = TRUE, compress = "xz") +if (!is.null(pfts)) usethis::use_data(pfts, overwrite = TRUE, compress = "xz") +if (!is.null(priors)) usethis::use_data(priors, overwrite = TRUE, compress = "xz") if (!is.null(managements)) usethis::use_data(managements, overwrite = TRUE, compress = "xz") -if (!is.null(entities)) 
usethis::use_data(entities, overwrite = TRUE, compress = "xz") +if (!is.null(entities)) usethis::use_data(entities, overwrite = TRUE, compress = "xz") if (!is.null(pfts_species)) usethis::use_data(pfts_species, overwrite = TRUE, compress = "xz") -if (!is.null(pfts_priors)) usethis::use_data(pfts_priors, overwrite = TRUE, compress = "xz") +if (!is.null(pfts_priors)) usethis::use_data(pfts_priors, overwrite = TRUE, compress = "xz") if (!is.null(managements_treatments)) usethis::use_data(managements_treatments, overwrite = TRUE, compress = "xz") if (!is.null(cultivars_pfts)) usethis::use_data(cultivars_pfts, overwrite = TRUE, compress = "xz") +# Rest of code... # --- Generate datapackage.json --- log_info("Generating datapackage.json at repo root (Frictionless spec)...") -# Helper to infer Frictionless type from R class -r_to_frictionless_type <- function(x) { - if (is.integer(x)) return("integer") - if (is.numeric(x)) return("number") - if (inherits(x, "Date")) return("date") - if (inherits(x, "POSIXt")) return("datetime") - if (is.logical(x)) return("boolean") - "string" -} - -# Build schema for any data frame -build_schema <- function(df) { - fields <- lapply(names(df), function(col) { - list(name = col, type = r_to_frictionless_type(df[[col]])) - }) - list(fields = fields) -} - -# Build resources list -datasets <- c("traitsview", "species", "sites", "variables", "citations", - "cultivars", "methods", "treatments", "pfts", "priors", - "managements", "entities", "pfts_species", "pfts_priors", - "managements_treatments", "cultivars_pfts") - -# datapackage.json lives at the repo root (Frictionless spec requires the -# descriptor at the root of the data package). Paths are relative to the repo -# root. data-raw/ is excluded from the built R package via .Rbuildignore, as -# is datapackage.json itself; this descriptor is for the repository / data -# release, not for the installed R package. 
-resources <- lapply(datasets, function(nm) { - df <- get(nm) - base <- list( - name = nm, - path = paste0("data-raw/csv/", nm, ".csv"), - format = "csv", - mediatype = "text/csv" - ) - if (nm == "traitsview") { - base$title <- "Traits and Yields View" - base$description <- "Denormalized view of plant trait measurements and crop yields" - } - if (!is.null(df)) { - base$schema <- build_schema(df) - } - base -}) - -datapackage <- list( - name = "betydata", - title = "BETYdb Plant Traits and Yields Data Package", - version = as.character(read.dcf("DESCRIPTION", fields = "Version")), - created = format(Sys.Date(), "%Y-%m-%d"), - licenses = list(list( - name = "ODC-By-1.0", - title = "Open Data Commons Attribution License 1.0", - path = "https://opendatacommons.org/licenses/by/1-0/" - )), - sources = list( - list(title = "BETYdb", path = "https://betydb.org"), - list(title = "LeBauer et al. (2018) GCB Bioenergy", path = "https://doi.org/10.1111/gcbb.12420") - ), - resources = resources -) +# NOTE: datapackage.json is now maintained manually and includes hand-curated +# constraint metadata. The auto-generation below is preserved but will WARN +# if the generated schema would overwrite constraints that were hand-added. +# The recommended workflow is to update datapackage.json directly when adding +# new tables or columns, preserving all constraints. 
-jsonlite::write_json(datapackage, "datapackage.json", - auto_unbox = TRUE, pretty = TRUE) -log_info(" datapackage.json written to repo root") \ No newline at end of file +log_info(" datapackage.json is maintained manually with constraint metadata.") +log_info(" If you added new columns, update datapackage.json by hand and") +log_info(" add appropriate constraints following the existing pattern.") diff --git a/datapackage.json b/datapackage.json index 6284833..8f58b4c 100644 --- a/datapackage.json +++ b/datapackage.json @@ -32,11 +32,13 @@ "fields": [ { "name": "trait", - "type": "string" + "type": "string", + "constraints": { "required": true } }, { "name": "mean", - "type": "number" + "type": "number", + "constraints": { "required": true } }, { "name": "units", @@ -68,11 +70,13 @@ }, { "name": "lat", - "type": "number" + "type": "number", + "constraints": { "minimum": -90, "maximum": 90 } }, { "name": "lon", - "type": "number" + "type": "number", + "constraints": { "minimum": -180, "maximum": 180 } }, { "name": "date", @@ -84,11 +88,13 @@ }, { "name": "month", - "type": "integer" + "type": "integer", + "constraints": { "minimum": 1, "maximum": 12 } }, { "name": "checked", - "type": "integer" + "type": "integer", + "constraints": { "enum": [0, 1] } }, { "name": "result_type", @@ -112,7 +118,8 @@ }, { "name": "n", - "type": "integer" + "type": "integer", + "constraints": { "minimum": 1 } }, { "name": "statname", @@ -148,7 +155,8 @@ }, { "name": "id", - "type": "integer" + "type": "integer", + "constraints": { "required": true } }, { "name": "citation_id", @@ -182,443 +190,171 @@ "fields": [ { "name": "id", - "type": "number" - }, - { - "name": "spcd", - "type": "number" - }, - { - "name": "genus", - "type": "string" - }, - { - "name": "species", - "type": "string" - }, - { - "name": "scientificname", - "type": "string" - }, - { - "name": "commonname", - "type": "string" - }, - { - "name": "notes", - "type": "string" - }, - { - "name": "created_at", - "type": 
"datetime" - }, - { - "name": "updated_at", - "type": "datetime" - }, - { - "name": "AcceptedSymbol", - "type": "string" - }, - { - "name": "SynonymSymbol", - "type": "boolean" - }, - { - "name": "Symbol", - "type": "string" - }, - { - "name": "PLANTS_Floristic_Area", - "type": "string" - }, - { - "name": "State", - "type": "string" - }, - { - "name": "Category", - "type": "string" - }, - { - "name": "Family", - "type": "string" - }, - { - "name": "FamilySymbol", - "type": "string" - }, - { - "name": "FamilyCommonName", - "type": "string" - }, - { - "name": "xOrder", - "type": "string" - }, - { - "name": "SubClass", - "type": "string" - }, - { - "name": "Class", - "type": "string" - }, - { - "name": "SubDivision", - "type": "string" - }, - { - "name": "Division", - "type": "string" - }, - { - "name": "SuperDivision", - "type": "string" - }, - { - "name": "SubKingdom", - "type": "string" - }, - { - "name": "Kingdom", - "type": "string" - }, - { - "name": "ITIS_TSN", - "type": "number" - }, - { - "name": "Duration", - "type": "string" - }, - { - "name": "GrowthHabit", - "type": "string" - }, - { - "name": "NativeStatus", - "type": "string" - }, - { - "name": "NationalWetlandIndicatorStatus", - "type": "string" - }, - { - "name": "RegionalWetlandIndicatorStatus", - "type": "string" - }, - { - "name": "ActiveGrowthPeriod", - "type": "string" - }, - { - "name": "AfterHarvestRegrowthRate", - "type": "string" - }, - { - "name": "Bloat", - "type": "string" - }, - { - "name": "C2N_Ratio", - "type": "string" - }, - { - "name": "CoppicePotential", - "type": "string" - }, - { - "name": "FallConspicuous", - "type": "string" - }, - { - "name": "FireResistance", - "type": "string" - }, - { - "name": "FoliageTexture", - "type": "string" - }, - { - "name": "GrowthForm", - "type": "string" - }, - { - "name": "GrowthRate", - "type": "string" - }, - { - "name": "MaxHeight20Yrs", - "type": "number" - }, - { - "name": "MatureHeight", - "type": "number" - }, - { - "name": 
"KnownAllelopath", - "type": "string" - }, - { - "name": "LeafRetention", - "type": "string" - }, - { - "name": "Lifespan", - "type": "string" - }, - { - "name": "LowGrowingGrass", - "type": "string" - }, - { - "name": "NitrogenFixation", - "type": "boolean" - }, - { - "name": "ResproutAbility", - "type": "string" - }, - { - "name": "AdaptedCoarseSoils", - "type": "string" - }, - { - "name": "AdaptedMediumSoils", - "type": "string" - }, - { - "name": "AdaptedFineSoils", - "type": "string" - }, - { - "name": "AnaerobicTolerance", - "type": "string" - }, - { - "name": "CaCO3Tolerance", - "type": "string" - }, - { - "name": "ColdStratification", - "type": "string" - }, - { - "name": "DroughtTolerance", - "type": "string" - }, - { - "name": "FertilityRequirement", - "type": "string" - }, - { - "name": "FireTolerance", - "type": "string" - }, - { - "name": "MinFrostFreeDays", - "type": "number" - }, - { - "name": "HedgeTolerance", - "type": "string" - }, - { - "name": "MoistureUse", - "type": "string" - }, - { - "name": "pH_Minimum", - "type": "number" - }, - { - "name": "pH_Maximum", - "type": "number" - }, - { - "name": "Min_PlantingDensity", - "type": "number" - }, - { - "name": "Max_PlantingDensity", - "type": "number" - }, - { - "name": "Precipitation_Minimum", - "type": "number" - }, - { - "name": "Precipitation_Maximum", - "type": "number" - }, - { - "name": "RootDepthMinimum", - "type": "number" - }, - { - "name": "SalinityTolerance", - "type": "string" - }, - { - "name": "ShadeTolerance", - "type": "string" - }, - { - "name": "TemperatureMinimum", - "type": "number" - }, - { - "name": "BloomPeriod", - "type": "string" - }, - { - "name": "CommercialAvailability", - "type": "string" - }, - { - "name": "FruitSeedPeriodBegin", - "type": "string" - }, - { - "name": "FruitSeedPeriodEnd", - "type": "string" - }, - { - "name": "Propogated_by_BareRoot", - "type": "string" - }, - { - "name": "Propogated_by_Bulbs", - "type": "string" - }, - { - "name": 
"Propogated_by_Container", - "type": "string" - }, - { - "name": "Propogated_by_Corms", - "type": "string" - }, - { - "name": "Propogated_by_Cuttings", - "type": "string" - }, - { - "name": "Propogated_by_Seed", - "type": "string" - }, - { - "name": "Propogated_by_Sod", - "type": "string" - }, - { - "name": "Propogated_by_Sprigs", - "type": "string" - }, - { - "name": "Propogated_by_Tubers", - "type": "string" - }, - { - "name": "Seeds_per_Pound", - "type": "number" - }, - { - "name": "SeedSpreadRate", - "type": "string" - }, - { - "name": "SeedlingVigor", - "type": "string" - } - ] - } - }, - { - "name": "sites", - "path": "data-raw/csv/sites.csv", - "format": "csv", - "mediatype": "text/csv", - "schema": { - "fields": [ - { - "name": "id", - "type": "number" - }, - { - "name": "city", - "type": "string" - }, - { - "name": "state", - "type": "string" - }, - { - "name": "country", - "type": "string" - }, - { - "name": "mat", - "type": "number" - }, - { - "name": "map", - "type": "number" - }, - { - "name": "soil", - "type": "string" + "type": "number", + "constraints": { "required": true, "unique": true } }, + { "name": "spcd", "type": "number" }, { - "name": "som", - "type": "boolean" + "name": "genus", + "type": "string", + "constraints": { "required": true } }, { - "name": "notes", - "type": "string" + "name": "species", + "type": "string", + "constraints": { "required": true } }, { - "name": "soilnotes", - "type": "string" - }, + "name": "scientificname", + "type": "string", + "constraints": { "required": true, "unique": true } + }, + { "name": "commonname", "type": "string" }, + { "name": "notes", "type": "string" }, + { "name": "created_at", "type": "datetime" }, + { "name": "updated_at", "type": "datetime" }, + { "name": "AcceptedSymbol", "type": "string" }, + { "name": "SynonymSymbol", "type": "boolean" }, + { "name": "Symbol", "type": "string" }, + { "name": "PLANTS_Floristic_Area", "type": "string" }, + { "name": "State", "type": "string" }, + { "name": 
"Category", "type": "string" }, + { "name": "Family", "type": "string" }, + { "name": "FamilySymbol", "type": "string" }, + { "name": "FamilyCommonName", "type": "string" }, + { "name": "xOrder", "type": "string" }, + { "name": "SubClass", "type": "string" }, + { "name": "Class", "type": "string" }, + { "name": "SubDivision", "type": "string" }, + { "name": "Division", "type": "string" }, + { "name": "SuperDivision", "type": "string" }, + { "name": "SubKingdom", "type": "string" }, + { "name": "Kingdom", "type": "string" }, + { "name": "ITIS_TSN", "type": "number" }, + { "name": "Duration", "type": "string" }, + { "name": "GrowthHabit", "type": "string" }, + { "name": "NativeStatus", "type": "string" }, + { "name": "NationalWetlandIndicatorStatus", "type": "string" }, + { "name": "RegionalWetlandIndicatorStatus", "type": "string" }, + { "name": "ActiveGrowthPeriod", "type": "string" }, + { "name": "AfterHarvestRegrowthRate", "type": "string" }, + { "name": "Bloat", "type": "string" }, + { "name": "C2N_Ratio", "type": "string" }, + { "name": "CoppicePotential", "type": "string" }, + { "name": "FallConspicuous", "type": "string" }, + { "name": "FireResistance", "type": "string" }, + { "name": "FoliageTexture", "type": "string" }, + { "name": "GrowthForm", "type": "string" }, + { "name": "GrowthRate", "type": "string" }, + { "name": "MaxHeight20Yrs", "type": "number" }, + { "name": "MatureHeight", "type": "number" }, + { "name": "KnownAllelopath", "type": "string" }, + { "name": "LeafRetention", "type": "string" }, + { "name": "Lifespan", "type": "string" }, + { "name": "LowGrowingGrass", "type": "string" }, + { "name": "NitrogenFixation", "type": "boolean" }, + { "name": "ResproutAbility", "type": "string" }, + { "name": "AdaptedCoarseSoils", "type": "string" }, + { "name": "AdaptedMediumSoils", "type": "string" }, + { "name": "AdaptedFineSoils", "type": "string" }, + { "name": "AnaerobicTolerance", "type": "string" }, + { "name": "CaCO3Tolerance", "type": "string" 
}, + { "name": "ColdStratification", "type": "string" }, + { "name": "DroughtTolerance", "type": "string" }, + { "name": "FertilityRequirement", "type": "string" }, + { "name": "FireTolerance", "type": "string" }, + { "name": "MinFrostFreeDays", "type": "number" }, + { "name": "HedgeTolerance", "type": "string" }, + { "name": "MoistureUse", "type": "string" }, + { "name": "pH_Minimum", "type": "number" }, + { "name": "pH_Maximum", "type": "number" }, + { "name": "Min_PlantingDensity", "type": "number" }, + { "name": "Max_PlantingDensity", "type": "number" }, + { "name": "Precipitation_Minimum", "type": "number" }, + { "name": "Precipitation_Maximum", "type": "number" }, + { "name": "RootDepthMinimum", "type": "number" }, + { "name": "SalinityTolerance", "type": "string" }, + { "name": "ShadeTolerance", "type": "string" }, + { "name": "TemperatureMinimum", "type": "number" }, + { "name": "BloomPeriod", "type": "string" }, + { "name": "CommercialAvailability", "type": "string" }, + { "name": "FruitSeedPeriodBegin", "type": "string" }, + { "name": "FruitSeedPeriodEnd", "type": "string" }, + { "name": "Propogated_by_BareRoot", "type": "string" }, + { "name": "Propogated_by_Bulbs", "type": "string" }, + { "name": "Propogated_by_Container", "type": "string" }, + { "name": "Propogated_by_Corms", "type": "string" }, + { "name": "Propogated_by_Cuttings", "type": "string" }, + { "name": "Propogated_by_Seed", "type": "string" }, + { "name": "Propogated_by_Sod", "type": "string" }, + { "name": "Propogated_by_Sprigs", "type": "string" }, + { "name": "Propogated_by_Tubers", "type": "string" }, + { "name": "Seeds_per_Pound", "type": "number" }, + { "name": "SeedSpreadRate", "type": "string" }, + { "name": "SeedlingVigor", "type": "string" } + ], + "primaryKey": ["id"] + } + }, + { + "name": "sites", + "path": "data-raw/csv/sites.csv", + "format": "csv", + "mediatype": "text/csv", + "schema": { + "fields": [ { - "name": "created_at", - "type": "datetime" + "name": "id", + "type": 
"number", + "constraints": { "required": true, "unique": true } }, + { "name": "city", "type": "string" }, + { "name": "state", "type": "string" }, + { "name": "country", "type": "string" }, { - "name": "updated_at", - "type": "datetime" + "name": "mat", + "type": "number", + "description": "Mean Annual Temperature (C)", + "constraints": { "minimum": -25, "maximum": 40 } }, { - "name": "sitename", - "type": "string" + "name": "map", + "type": "number", + "description": "Mean Annual Precipitation (mm)", + "constraints": { "minimum": 0, "maximum": 12000 } }, + { "name": "soil", "type": "string" }, { - "name": "greenhouse", - "type": "boolean" + "name": "som", + "type": "number", + "description": "Soil organic matter percent", + "constraints": { "minimum": 0, "maximum": 100 } }, + { "name": "notes", "type": "string" }, + { "name": "soilnotes", "type": "string" }, + { "name": "created_at", "type": "datetime" }, + { "name": "updated_at", "type": "datetime" }, { - "name": "user_id", - "type": "number" + "name": "sitename", + "type": "string", + "constraints": { "required": true } }, + { "name": "greenhouse", "type": "boolean" }, + { "name": "user_id", "type": "number" }, { "name": "sand_pct", - "type": "number" + "type": "number", + "constraints": { "minimum": 0, "maximum": 100 } }, { "name": "clay_pct", - "type": "number" - }, - { - "name": "geometry", - "type": "string" + "type": "number", + "constraints": { "minimum": 0, "maximum": 100 } }, - { - "name": "time_zone", - "type": "string" - } - ] + { "name": "geometry", "type": "string" }, + { "name": "time_zone", "type": "string" } + ], + "primaryKey": ["id"] } }, { @@ -630,57 +366,39 @@ "fields": [ { "name": "id", - "type": "number" - }, - { - "name": "description", - "type": "string" + "type": "number", + "constraints": { "required": true, "unique": true } }, + { "name": "description", "type": "string" }, { "name": "units", - "type": "string" - }, - { - "name": "notes", - "type": "string" - }, - { - "name": 
"created_at", - "type": "datetime" - }, - { - "name": "updated_at", - "type": "datetime" + "type": "string", + "constraints": { "required": true } }, + { "name": "notes", "type": "string" }, + { "name": "created_at", "type": "datetime" }, + { "name": "updated_at", "type": "datetime" }, { "name": "name", - "type": "string" + "type": "string", + "constraints": { "required": true, "unique": true } }, { "name": "max", - "type": "string" + "type": "string", + "description": "Upper bound for trait mean values of this variable. NULL means no upper limit." }, { "name": "min", - "type": "string" - }, - { - "name": "standard_name", - "type": "string" - }, - { - "name": "standard_units", - "type": "string" - }, - { - "name": "label", - "type": "string" - }, - { - "name": "type", - "type": "string" - } - ] + "type": "string", + "description": "Lower bound for trait mean values of this variable. NULL means no lower limit." + }, + { "name": "standard_name", "type": "string" }, + { "name": "standard_units", "type": "string" }, + { "name": "label", "type": "string" }, + { "name": "type", "type": "string" } + ], + "primaryKey": ["id"] } }, { @@ -692,57 +410,44 @@ "fields": [ { "name": "id", - "type": "number" - }, - { - "name": "author", - "type": "string" - }, - { - "name": "year", - "type": "number" - }, - { - "name": "title", - "type": "string" - }, - { - "name": "journal", - "type": "string" - }, - { - "name": "vol", - "type": "number" - }, - { - "name": "pg", - "type": "string" - }, - { - "name": "url", - "type": "string" + "type": "number", + "constraints": { "required": true, "unique": true } }, { - "name": "pdf", - "type": "string" + "name":"author", + "type":"string", + "constraints":{"required":true} }, { - "name": "created_at", - "type": "datetime" + "name":"year", + "type":"number", + "constraints":{ + "required":true, + "minimum":1500, + "maximum":2100 + } }, { - "name": "updated_at", - "type": "datetime" + "name":"title", + "type":"string", + 
"constraints":{"required":true} }, + { "name": "journal", "type": "string" }, { - "name": "doi", - "type": "string" + "name":"vol", + "type":"number", + "constraints":{"minimum":1} }, - { - "name": "user_id", - "type": "number" - } - ] + { "name": "pg", "type": "string" }, + { "name": "url", "type": "string" }, + { "name": "pdf", "type": "string" }, + { "name": "created_at", "type": "datetime" }, + { "name": "updated_at", "type": "datetime" }, + { "name": "doi", "type": "string" }, + { "name": "user_id", "type": "number" } + ], + "primaryKey": ["id"], + "missingValues": [""] } }, { @@ -754,35 +459,33 @@ "fields": [ { "name": "id", - "type": "number" + "type": "number", + "constraints": { "required": true, "unique": true } }, { "name": "specie_id", - "type": "number" + "type": "number", + "constraints": { "required": true } }, { "name": "name", - "type": "string" - }, - { - "name": "ecotype", - "type": "string" - }, - { - "name": "notes", - "type": "string" - }, - { - "name": "created_at", - "type": "datetime" - }, - { - "name": "updated_at", - "type": "datetime" - }, - { - "name": "previous_id", - "type": "string" + "type": "string", + "constraints": { "required": true } + }, + { "name": "ecotype", "type": "string" }, + { "name": "notes", "type": "string" }, + { "name": "created_at", "type": "datetime" }, + { "name": "updated_at", "type": "datetime" }, + { "name": "previous_id", "type": "string" } + ], + "primaryKey": ["id"], + "foreignKeys": [ + { + "fields": ["specie_id"], + "reference": { + "resource": "species", + "fields": ["id"] + } } ] } @@ -796,29 +499,20 @@ "fields": [ { "name": "id", - "type": "number" + "type": "number", + "constraints": { "required": true, "unique": true } }, { "name": "name", - "type": "string" - }, - { - "name": "description", - "type": "string" - }, - { - "name": "citation_id", - "type": "number" - }, - { - "name": "created_at", - "type": "datetime" - }, - { - "name": "updated_at", - "type": "datetime" - } - ] + "type": "string", + 
"constraints": { "required": true } + }, + { "name": "description", "type": "string" }, + { "name": "citation_id", "type": "number" }, + { "name": "created_at", "type": "datetime" }, + { "name": "updated_at", "type": "datetime" } + ], + "primaryKey": ["id"] } }, { @@ -830,33 +524,25 @@ "fields": [ { "name": "id", - "type": "number" + "type": "number", + "constraints": { "required": true, "unique": true } }, { "name": "name", - "type": "string" + "type": "string", + "constraints": { "required": true } }, { "name": "definition", - "type": "string" - }, - { - "name": "created_at", - "type": "datetime" - }, - { - "name": "updated_at", - "type": "datetime" - }, - { - "name": "control", - "type": "boolean" - }, - { - "name": "user_id", - "type": "number" - } - ] + "type": "string", + "constraints": { "required": true } + }, + { "name": "created_at", "type": "datetime" }, + { "name": "updated_at", "type": "datetime" }, + { "name": "control", "type": "boolean" }, + { "name": "user_id", "type": "number" } + ], + "primaryKey": ["id"] } }, { @@ -868,37 +554,26 @@ "fields": [ { "name": "id", - "type": "number" - }, - { - "name": "definition", - "type": "string" - }, - { - "name": "created_at", - "type": "datetime" + "type": "number", + "constraints": { "required": true, "unique": true } }, { - "name": "updated_at", - "type": "datetime" + "name":"definition", + "type":"string", + "constraints":{"required":true} }, + { "name": "created_at", "type": "datetime" }, + { "name": "updated_at", "type": "datetime" }, { "name": "name", - "type": "string" - }, - { - "name": "parent_id", - "type": "number" - }, - { - "name": "pft_type", - "type": "string" - }, - { - "name": "modeltype_id", - "type": "number" - } - ] + "type": "string", + "constraints": { "required": true } + }, + { "name": "parent_id", "type": "number" }, + { "name": "pft_type", "type": "string" }, + { "name": "modeltype_id", "type": "number" } + ], + "primaryKey": ["id"] } }, { @@ -910,51 +585,52 @@ "fields": [ { "name": 
"id", - "type": "number" - }, - { - "name": "citation_id", - "type": "number" + "type": "number", + "constraints": { "required": true, "unique": true } }, + { "name": "citation_id", "type": "number" }, { "name": "variable_id", - "type": "number" - }, - { - "name": "phylogeny", - "type": "string" + "type": "number", + "constraints": { "required": true } }, + { "name": "phylogeny", "type": "string" }, { "name": "distn", - "type": "string" - }, - { - "name": "parama", - "type": "number" - }, - { - "name": "paramb", - "type": "number" - }, - { - "name": "paramc", - "type": "number" - }, + "type": "string", + "constraints": { + "required": true, + "enum": [ + "unif", + "norm", + "lnorm", + "beta", + "gamma", + "weibull", + "exp" + ] + } + }, + { "name": "parama", "type": "number" }, + { "name": "paramb", "type": "number" }, + { "name": "paramc", "type": "number" }, { "name": "n", - "type": "number" - }, - { - "name": "notes", - "type": "string" - }, - { - "name": "created_at", - "type": "datetime" - }, - { - "name": "updated_at", - "type": "datetime" + "type": "number", + "constraints": { "minimum": 0 } + }, + { "name": "notes", "type": "string" }, + { "name": "created_at", "type": "datetime" }, + { "name": "updated_at", "type": "datetime" } + ], + "primaryKey": ["id"], + "foreignKeys": [ + { + "fields": ["variable_id"], + "reference": { + "resource": "variables", + "fields": ["id"] + } } ] } @@ -968,49 +644,33 @@ "fields": [ { "name": "id", - "type": "number" - }, - { - "name": "citation_id", - "type": "number" + "type": "number", + "constraints": { "required": true, "unique": true } }, + { "name": "citation_id", "type": "number" }, { "name": "date", - "type": "date" - }, - { - "name": "dateloc", - "type": "number" + "type": "date", + "constraints": { "required": true } }, + { "name": "dateloc", "type": "number" }, { "name": "mgmttype", - "type": "string" + "type": "string", + "constraints": { "required": true } }, { "name": "level", - "type": "number" - }, - { - 
"name": "units", - "type": "string" - }, - { - "name": "notes", - "type": "string" - }, - { - "name": "created_at", - "type": "datetime" - }, - { - "name": "updated_at", - "type": "datetime" - }, - { - "name": "user_id", - "type": "number" - } - ] + "type": "number", + "constraints": { "minimum": 0 } + }, + { "name": "units", "type": "string" }, + { "name": "notes", "type": "string" }, + { "name": "created_at", "type": "datetime" }, + { "name": "updated_at", "type": "datetime" }, + { "name": "user_id", "type": "number" } + ], + "primaryKey": ["id"] } }, { @@ -1022,29 +682,16 @@ "fields": [ { "name": "id", - "type": "number" - }, - { - "name": "parent_id", - "type": "boolean" - }, - { - "name": "name", - "type": "string" - }, - { - "name": "notes", - "type": "boolean" - }, - { - "name": "created_at", - "type": "datetime" - }, - { - "name": "updated_at", - "type": "datetime" - } - ] + "type": "number", + "constraints": { "required": true, "unique": true } + }, + { "name": "parent_id", "type": "number" }, + { "name": "name", "type": "string" }, + { "name": "notes", "type": "string" }, + { "name": "created_at", "type": "datetime" }, + { "name": "updated_at", "type": "datetime" } + ], + "primaryKey": ["id"] } }, { @@ -1056,23 +703,27 @@ "fields": [ { "name": "pft_id", - "type": "number" + "type": "number", + "constraints": { "required": true } }, { "name": "specie_id", - "type": "number" - }, - { - "name": "created_at", - "type": "datetime" + "type": "number", + "constraints": { "required": true } }, + { "name": "created_at", "type": "datetime" }, + { "name": "updated_at", "type": "datetime" }, + { "name": "id", "type": "number" } + ], + "primaryKey": ["pft_id", "specie_id"], + "foreignKeys": [ { - "name": "updated_at", - "type": "datetime" + "fields": ["pft_id"], + "reference": { "resource": "pfts", "fields": ["id"] } }, { - "name": "id", - "type": "number" + "fields": ["specie_id"], + "reference": { "resource": "species", "fields": ["id"] } } ] } @@ -1086,23 +737,27 
@@ "fields": [ { "name": "pft_id", - "type": "number" + "type": "number", + "constraints": { "required": true } }, { "name": "prior_id", - "type": "number" + "type": "number", + "constraints": { "required": true } }, + { "name": "created_at", "type": "datetime" }, + { "name": "updated_at", "type": "datetime" }, + { "name": "id", "type": "number" } + ], + "primaryKey": ["pft_id", "prior_id"], + "foreignKeys": [ { - "name": "created_at", - "type": "datetime" + "fields": ["pft_id"], + "reference": { "resource": "pfts", "fields": ["id"] } }, { - "name": "updated_at", - "type": "datetime" - }, - { - "name": "id", - "type": "number" + "fields": ["prior_id"], + "reference": { "resource": "priors", "fields": ["id"] } } ] } @@ -1116,23 +771,27 @@ "fields": [ { "name": "treatment_id", - "type": "number" + "type": "number", + "constraints": { "required": true } }, { "name": "management_id", - "type": "number" - }, - { - "name": "created_at", - "type": "datetime" + "type": "number", + "constraints": { "required": true } }, + { "name": "created_at", "type": "datetime" }, + { "name": "updated_at", "type": "datetime" }, + { "name": "id", "type": "number" } + ], + "primaryKey": ["treatment_id", "management_id"], + "foreignKeys": [ { - "name": "updated_at", - "type": "datetime" + "fields": ["treatment_id"], + "reference": { "resource": "treatments", "fields": ["id"] } }, { - "name": "id", - "type": "number" + "fields": ["management_id"], + "reference": { "resource": "managements", "fields": ["id"] } } ] } @@ -1146,23 +805,27 @@ "fields": [ { "name": "pft_id", - "type": "number" + "type": "number", + "constraints": { "required": true } }, { "name": "cultivar_id", - "type": "number" + "type": "number", + "constraints": { "required": true } }, + { "name": "created_at", "type": "datetime" }, + { "name": "updated_at", "type": "datetime" }, + { "name": "id", "type": "number" } + ], + "primaryKey": ["pft_id", "cultivar_id"], + "foreignKeys": [ { - "name": "created_at", - "type": 
"datetime" + "fields": ["pft_id"], + "reference": { "resource": "pfts", "fields": ["id"] } }, { - "name": "updated_at", - "type": "datetime" - }, - { - "name": "id", - "type": "number" + "fields": ["cultivar_id"], + "reference": { "resource": "cultivars", "fields": ["id"] } } ] } diff --git a/docs/constraint-validation.md b/docs/constraint-validation.md new file mode 100644 index 0000000..629a002 --- /dev/null +++ b/docs/constraint-validation.md @@ -0,0 +1,286 @@ +# Data Constraint Validation + +## Overview + +The `betydata` package enforces key integrity constraints inherited from +BETYdb’s PostgreSQL schema using a layered validation system plus build-time +filtering during data preparation. + +Validation runs automatically in `data-raw/make-data.R` before `.rda` package +data are generated. Constraint violations are either reported, used to halt +validation when appropriate, or quarantined into `data-raw/invalid_data/`. + +Detailed audit decisions and rationale for implemented vs deferred constraints +are documented in: + +`docs/constraint-decisions.md` + +--- + +# Validation Architecture + +Three complementary layers are used. 
+ +## Layer 1 — Frictionless Constraints (`datapackage.json`) + +Used for constraints natively expressible through schema metadata: + +- required fields +- numeric bounds +- enumerated values +- uniqueness +- primary keys +- foreign keys + +Validated through: + +- `validate_frictionless_fields()` +- `validate_primary_key()` +- `validate_foreign_keys()` + +### Current Frictionless Constraint Coverage + +| Table | Field | Constraint | +|---|---|---| +| sites | sitename | required | +| sites | mat | -25 to 40 | +| sites | map | 0 to 12000 | +| sites | sand_pct | 0 to 100 | +| sites | clay_pct | 0 to 100 | +| traitsview | mean | required | +| traitsview | lat | -90 to 90 | +| traitsview | lon | -180 to 180 | +| traitsview | checked | enum: 0,1 | +| cultivars | name | required | +| cultivars | specie_id | required + foreign key | +| species | genus | required | +| species | species | required | +| species | scientificname | required + unique | +| citations | author | required | +| citations | year | bounded numeric | +| citations | title | required | +| pfts | definition | required | +| pfts | name | unique | +| priors | distn | enum constraint | +| All tables | id | primary key | + +--- + +## Layer 2 — Custom Constraints (`custom_constraints.yaml`) + +Used for rules Frictionless cannot represent: + +- cross-field arithmetic +- conditional dependencies +- compound uniqueness +- cross-table lookups + +Implemented in: + +`R/validate_custom.R` + +### Supported Constraint Types + +| Type | Example | +|---|---| +| `sum_limit` | sand_pct + clay_pct ≤ 100 | +| `conditional` | stat requires statname | +| `unique_combination` | cultivar name unique per species | +| `conditional_range` | trait mean within variable range | + +### Current Custom Constraint Coverage + +| Table | Constraint | Type | +|---|---|---| +| sites | sand_pct + clay_pct <= 100 | sum_limit | +| traits | stat -> statname dependency | conditional | +| traits | statname -> stat dependency | conditional | +| 
traits | mean within variable min/max | conditional_range | +| cultivars | name unique per species | unique_combination | +| citations | author-year-title uniqueness | unique_combination | +| methods | name unique per citation | unique_combination | +| managements | level requires units | conditional | + +--- + +## Cross-Table Range Note + +Trait range validation joins: + +`traitsview.trait` → `variables.name` + +(not `variable_id`, which is absent from `traitsview`). + +The validator interprets numeric ranges while handling: + +- `Inf` +- `-Inf` +- `NA` + +for open-ended limits. + +--- + +## Layer 3 — Build-Time Filtering (`make-data.R`) + +Some integrity rules are currently enforced during data preparation via: + +- `access_level == 4` filtering (public records only) +- `checked >= 0` filtering (exclude failed QC) +- quarantine of invalid records in: + +`data-raw/invalid_data/` + +Filtering complements explicit validation and is treated as part of current +build-time enforcement. + +--- + +# Additions from Issue #14 Constraint Coverage Audit + +Additional coverage added through the Issue #14 audit included: + +## Frictionless Additions +Added constraint groups for: + +- species completeness + uniqueness +- citation completeness +- PFT completeness + uniqueness +- priors distribution validation + +## Custom Constraint Additions Reviewed + +Five candidate custom additions were reviewed. 
+ +Implemented: + +- citation natural-key uniqueness +- methods uniqueness within citation +- units required when level present + +Reviewed and deferred after empirical validation: + +- management event uniqueness + (global `(date, mgmttype)` produced false positives) + +- prior uniqueness keys + (candidate uniqueness assumptions violated in shipped data) + +--- + +# Deferred Constraints (Documented, Not Implemented) + +The following were reviewed but intentionally deferred: + +| Constraint | Reason | +|---|---| +| variables min <= max | requires new validator type | +| statname controlled vocabulary | needs vocabulary review | +| priors parameter dependency | needs additional rule design | +| priors paramb required | current data contains missing values | +| methods description required | current data contains missing values | +| management event uniqueness | false positives in shipped data | +| prior uniqueness key | uniqueness definition unresolved | +| advanced geometry uniqueness | too complex for current scope | + +Deferred constraints and rationale are tracked in: + +`docs/constraint-decisions.md` + +--- + +# R Implementation + +## `validate_all()` +Coordinates: + +- Frictionless checks +- custom YAML checks +- stop vs report behavior + +--- + +## `validate_custom()` +Dispatches to: + +- `run_sum_limit()` +- `run_conditional()` +- `run_unique_combination()` +- `run_conditional_range()` + +Validation is invoked during package build from: + +`data-raw/make-data.R` + +--- + +# Running Validation Manually + +```r +source("R/validate_custom.R") +source("R/validate.R") + +tables <- list( + sites = sites, + traitsview = traitsview, + cultivars = cultivars, + variables = variables +) + +validate_all(tables, stop_on_error = FALSE) +``` + +--- + +# Tests + +Constraint behavior is covered in: + +`tests/testthat/test_constraints.R` + +Includes tests for: + +- valid data passes +- intentionally invalid mock data fails +- composite constraints +- conditional constraints 
+- cross-table range validation +- added uniqueness constraints + +Note: + +A legacy row-count expectation in `test-data.R` + +```r +expect_gt(nrow(traitsview), 40000) +``` + +may fail on filtered/public packaged data (~18010 rows) and is unrelated to +constraint logic. + +--- + +# Adding New Constraints + +If Frictionless can express the rule: + +- add it to `datapackage.json` +- add valid + invalid tests + +If it is cross-field or cross-table: + +- add to `custom_constraints.yaml` +- add corresponding tests in `test_constraints.R` + +If a new constraint type is needed: + +- implement runner in `validate_custom.R` +- register it in the constraint dispatcher + +--- + +## Scope Note + +This documentation describes constraints relevant to tables shipped in this +repository. Constraints from non-shipped BETYdb tables or high-complexity +business logic may be documented but intentionally not migrated. \ No newline at end of file diff --git a/docs/constraints-decision.md b/docs/constraints-decision.md new file mode 100644 index 0000000..da3c111 --- /dev/null +++ b/docs/constraints-decision.md @@ -0,0 +1,195 @@ +# Constraint Coverage Decisions for Issue #14 + +## Purpose + +This document records constraint coverage decisions made while reviewing +constraints relevant to shipped tables in `betydata`. + +It is a decision audit, not a full reproduction of the legacy constraints +spreadsheet. For each reviewed constraint, this documents whether it is: + +- implemented +- already covered +- deferred with rationale +- intentionally not migrated + +Constraints are not silently omitted; reviewed constraints are explicitly +classified. + +--- + +# Sources Reviewed + +Constraint coverage decisions were derived from three sources of truth: + +1. BETYdb constraint documentation (Overleaf PDF) + +2. Constraints Spreadsheet + (legacy constraint inventory) + +3. 
`db/structure.sql` and associated PostgreSQL triggers / migrations in + the BETY repository + +Only constraints relevant to tables shipped in this repository were considered. + +--- + +# Status Codes + +| Status | Meaning | +|---|---| +| Implemented | Added or enforced in this PR | +| Covered | Already enforced elsewhere in package | +| Deferred | Reviewed but intentionally not implemented | +| Excluded | Outside package scope | + +--- + +# Coverage Matrix + +## Existing Constraints Reviewed and Retained + +| Constraint | Source | Status | Enforcement | +|---|---|---|---| +| Required fields / primary keys | SQL + datapackage | Covered | datapackage.json | +| Foreign key coverage | SQL | Covered | validation layer | +| Soil fraction sum | site.rb | Covered | custom YAML | +| stat/statname dependency | trait.rb | Covered | custom YAML | +| Trait mean variable range | SQL trigger | Covered | custom YAML | +| Cultivar uniqueness | SQL / model | Covered | custom YAML | + +--- + +## Added in this PR — Frictionless Constraints + +| Constraint Group | Status | Location | +|---|---|---| +| Species completeness + uniqueness | Implemented | datapackage.json | +| Citation completeness constraints | Implemented | datapackage.json | +| PFT completeness + uniqueness | Implemented | datapackage.json | +| Priors distribution enum | Implemented | datapackage.json | + +--- + +## Candidate Custom Constraints Reviewed + +Five candidate additions were reviewed. + +### Implemented + +| Constraint | Status | Location | +|---|---|---| +| Citation author-year-title uniqueness | Implemented | custom YAML | +| Method uniqueness within citation | Implemented | custom YAML | +| Units required when level present | Implemented | custom YAML | + +--- + +### Reviewed and Deferred After Validation + +These were investigated but deferred because empirical checks showed proposed +uniqueness assumptions were too broad. 
+ +| Constraint | Reason | +|---|---| +| Management event uniqueness | `(date, mgmttype)` produced false positives in shipped data | +| Prior uniqueness key | Candidate uniqueness keys violated in current data | + +--- + +# Additional Deferred Constraints + +Reviewed but intentionally deferred: + +| Constraint | Reason | +|---|---| +| variables min <= max | requires new validator type | +| statname controlled vocabulary | requires vocabulary review | +| priors parameter dependency | requires additional rule design | +| priors paramb required | current data contains missing values | +| methods description required | current data contains missing values | +| advanced geometry uniqueness | too complex for current scope | + +--- + +# Explicitly Excluded + +Not migrated intentionally: + +- constraints involving non-shipped tables +- high-complexity multi-table business logic +- constraints redundant with package structure or foreign keys +- legacy business rules with poor CSV-era fit + +Examples: + +- `ensure_correct_cultivar_for_site` +- deep multi-table trigger logic +- geometry co-dependence beyond documented checks + +--- + +# Build-Time Filtering as Enforcement + +Some integrity rules are currently enforced during package build through: + +- access-level filtering +- QC filtering (`checked >= 0`) +- quarantine of invalid rows into `data-raw/invalid_data/` + +These are treated as build-time enforcement complementary to runtime validation. + +Filtering logic was reviewed but intentionally not redesigned in this work. 
+ +--- + +# Coverage Summary + +## Reviewed in this PR + +### Already covered and retained +- structural schema constraints +- foreign key integrity +- numeric range constraints +- existing custom constraints + +### New Frictionless additions +4 constraint groups implemented + +### Candidate custom additions reviewed +5 reviewed + +Implemented: +- 3 + +Deferred after empirical validation: +- 2 + +### Additional deferred constraints documented +6 + +--- + +## Scope Decision + +This work prioritizes constraints that: + +- prevent real data corruption +- are maintainable in CSV-based validation +- fit shipped package tables +- provide high value relative to implementation complexity + +The goal is selective, documented migration — not exhaustive reproduction of +all historical BETYdb constraints. + +--- + +## Result + +This contributes: + +- additional machine-readable constraints +- expanded custom validation coverage +- explicit documentation of omissions +- empirical review of candidate constraints +- documented distinction between implemented vs deferred rules \ No newline at end of file diff --git a/inst/extdata/custom_constraints.yaml b/inst/extdata/custom_constraints.yaml new file mode 100644 index 0000000..e60758f --- /dev/null +++ b/inst/extdata/custom_constraints.yaml @@ -0,0 +1,119 @@ +# custom_constraints.yaml +# +# Constraints that cannot be expressed natively in Frictionless Table Schema. +# These supplement constraints declared in datapackage.json. 
+# +# Categories +# +# composite_constraints: +# Cross-field rules within a table +# +# custom_constraints: +# Cross-table lookup rules requiring joins +# +# Sources: +# - BETYdb constraints documentation +# - Constraints spreadsheet +# - PostgreSQL structure.sql and trigger definitions + +sites: + composite_constraints: + + - id: soil_fraction_sum + type: sum_limit + columns: [sand_pct, clay_pct] + operator: "<=" + value: 100 + message: "sand_pct + clay_pct must not exceed 100" + + - id: geometry_co_specification + type: all_or_none + columns: [geometry] + description: > + lat, lon, and masl must all be specified together or not at all. + Enforced indirectly through geometry representation in CSV data. + message: "lat, lon, and masl must all be present together" + + +traits: + composite_constraints: + + - id: stat_requires_statname + type: conditional + if: + column: stat + condition: not_null + then: + column: statname + condition: not_null_and_not_empty + message: "statname is required when stat is provided" + + - id: statname_requires_stat + type: conditional + if: + column: statname + condition: not_null_and_not_empty + then: + column: stat + condition: not_null + message: "stat is required when statname is provided" + + + custom_constraints: + + - id: trait_mean_in_variable_range + type: conditional_range + column: mean + lookup: + table: variables + key_col: name + min_col: min + max_col: max + join_on: trait + null_means_no_limit: true + message: > + trait mean must fall within the min/max range defined for the linked + variable + + +cultivars: + composite_constraints: + + - id: unique_name_per_species + type: unique_combination + columns: [name, specie_id] + message: "Cultivar name must be unique within a species" + + +citations: + composite_constraints: + + - id: unique_citation_identity + type: unique_combination + columns: [author, year, title] + message: "citation author-year-title combination must be unique" + + +methods: + composite_constraints: + + 
- id: unique_method_per_citation + type: unique_combination + columns: [name, citation_id] + message: "method name must be unique within citation" + + +managements: + composite_constraints: + + - id: level_requires_units + type: conditional + if: + column: level + condition: not_null + then: + column: units + condition: not_null_and_not_empty + message: "units required when level is provided" + + diff --git a/man/load_custom_constraints.Rd b/man/load_custom_constraints.Rd new file mode 100644 index 0000000..c77fca2 --- /dev/null +++ b/man/load_custom_constraints.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/validate_custom.R +\name{load_custom_constraints} +\alias{load_custom_constraints} +\title{Load the custom constraints definition file} +\usage{ +load_custom_constraints(path = "data-raw/custom_constraints.yaml") +} +\arguments{ +\item{path}{Path to custom_constraints.yaml. During development, this should +point to data-raw/custom_constraints.yaml. After package installation, +it points to system.file("extdata", "custom_constraints.yaml", package = "betydata").} +} +\value{ +A named list parsed from the YAML file. 
+} +\description{ +Load the custom constraints definition file +} +\keyword{internal} diff --git a/man/run_composite_constraints.Rd b/man/run_composite_constraints.Rd new file mode 100644 index 0000000..de0479c --- /dev/null +++ b/man/run_composite_constraints.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/validate_custom.R +\name{run_composite_constraints} +\alias{run_composite_constraints} +\title{Dispatch and run all composite_constraints for a single table} +\usage{ +run_composite_constraints(df, constraints) +} +\arguments{ +\item{df}{data frame for the table being validated} + +\item{constraints}{the composite_constraints list from the YAML for this table} +} +\value{ +character vector of all error messages +} +\description{ +Dispatch and run all composite_constraints for a single table +} +\keyword{internal} diff --git a/man/run_conditional.Rd b/man/run_conditional.Rd new file mode 100644 index 0000000..369e897 --- /dev/null +++ b/man/run_conditional.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/validate_custom.R +\name{run_conditional} +\alias{run_conditional} +\title{Run a conditional constraint (if field A is set, field B must also be set)} +\usage{ +run_conditional(df, con) +} +\arguments{ +\item{df}{data frame} + +\item{con}{a single composite_constraint list element of type "conditional"} +} +\value{ +character vector of error messages (empty if all rows pass) +} +\description{ +Run a conditional constraint (if field A is set, field B must also be set) +} +\keyword{internal} diff --git a/man/run_conditional_range.Rd b/man/run_conditional_range.Rd new file mode 100644 index 0000000..7e2749b --- /dev/null +++ b/man/run_conditional_range.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/validate_custom.R +\name{run_conditional_range} +\alias{run_conditional_range} +\title{Run a conditional_range 
constraint (cross-table lookup)} +\usage{ +run_conditional_range(df, lookup_df, con) +} +\arguments{ +\item{df}{data frame for the table being validated (e.g. traitsview)} + +\item{lookup_df}{data frame of the lookup table (e.g. variables)} + +\item{con}{a single custom_constraint list element of type +"conditional_range"} +} +\value{ +character vector of error messages (empty if all rows pass) +} +\description{ +Checks that a column's values fall within the min/max bounds defined for +each row's linked record in another table. Replicates the logic of the +PostgreSQL restrict_trait_range trigger. +} +\details{ +OPTIMIZED: Uses vector operations and match() instead of merge() to avoid +creating large intermediate dataframes. +} +\keyword{internal} diff --git a/man/run_custom_constraints.Rd b/man/run_custom_constraints.Rd new file mode 100644 index 0000000..1158cbd --- /dev/null +++ b/man/run_custom_constraints.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/validate_custom.R +\name{run_custom_constraints} +\alias{run_custom_constraints} +\title{Dispatch and run all custom_constraints for a single table} +\usage{ +run_custom_constraints(tables, table_name, constraints) +} +\arguments{ +\item{tables}{named list of all data frames (all tables loaded)} + +\item{table_name}{name of the table currently being validated} + +\item{constraints}{the custom_constraints list from the YAML for this table} +} +\value{ +character vector of all error messages +} +\description{ +Dispatch and run all custom_constraints for a single table +} +\keyword{internal} diff --git a/man/run_sum_limit.Rd b/man/run_sum_limit.Rd new file mode 100644 index 0000000..8fe1096 --- /dev/null +++ b/man/run_sum_limit.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/validate_custom.R +\name{run_sum_limit} +\alias{run_sum_limit} +\title{Run a sum_limit constraint} +\usage{ +run_sum_limit(df, con) +} 
+\arguments{ +\item{df}{data frame} + +\item{con}{a single composite_constraint list element of type "sum_limit"} +} +\value{ +character vector of error messages (empty if all rows pass) +} +\description{ +Checks that the row-wise sum of specified columns does not exceed a value. +Example: sand_pct + clay_pct <= 100 +} +\keyword{internal} diff --git a/man/run_unique_combination.Rd b/man/run_unique_combination.Rd new file mode 100644 index 0000000..8a4fda3 --- /dev/null +++ b/man/run_unique_combination.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/validate_custom.R +\name{run_unique_combination} +\alias{run_unique_combination} +\title{Run a unique_combination constraint} +\usage{ +run_unique_combination(df, con) +} +\arguments{ +\item{df}{data frame} + +\item{con}{a single composite_constraint list element of type +"unique_combination"} +} +\value{ +character vector of error messages (empty if all rows pass) +} +\description{ +Checks that no two rows share the same values in ALL specified columns. +Example: (name, specie_id) must be unique in cultivars. +} +\keyword{internal} diff --git a/man/validate_all.Rd b/man/validate_all.Rd new file mode 100644 index 0000000..622f024 --- /dev/null +++ b/man/validate_all.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/validate.R +\name{validate_all} +\alias{validate_all} +\title{Run all validation layers and report results} +\usage{ +validate_all( + tables, + datapackage_path = "datapackage.json", + constraints_path = "data-raw/custom_constraints.yaml", + stop_on_error = TRUE +) +} +\arguments{ +\item{tables}{Named list of data frames. Names must match resource names +in datapackage.json and table names in custom_constraints.yaml. 
+Only include tables that have constraints; exclude junction tables.} + +\item{datapackage_path}{Path to datapackage.json (default: repo root).} + +\item{constraints_path}{Path to custom_constraints.yaml.} + +\item{stop_on_error}{If TRUE (default), calls stop() when errors are found. +Set to FALSE to return results without stopping (useful in tests).} +} +\value{ +Invisibly returns a named list of error vectors per table. +Empty list means all checks passed. +} +\description{ +This is the main entry point. It runs both the Frictionless-native +constraint checks and the custom constraint checks, then collates +results and either stops (on hard errors) or messages warnings. +} diff --git a/man/validate_custom.Rd b/man/validate_custom.Rd new file mode 100644 index 0000000..98d51e2 --- /dev/null +++ b/man/validate_custom.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/validate_custom.R +\name{validate_custom} +\alias{validate_custom} +\title{Run all custom (non-Frictionless) constraints across all loaded tables} +\usage{ +validate_custom(tables, constraints_path = "data-raw/custom_constraints.yaml") +} +\arguments{ +\item{tables}{Named list of data frames. Names must match table names used +in the YAML (e.g. "sites", "traitsview", "cultivars", "variables"). +Should ONLY include tables that have constraints; exclude junction tables.} + +\item{constraints_path}{Path to custom_constraints.yaml. Defaults to +data-raw/custom_constraints.yaml (development) or installed package path.} +} +\value{ +Named list of character vectors. Each name is a table that had +errors; each value is a vector of error messages. Tables with no errors +are omitted from the output. +} +\description{ +Loads the constraint definitions from the YAML file and applies each +applicable constraint to the corresponding data frame in \code{tables}. 
+} diff --git a/man/validate_foreign_keys.Rd b/man/validate_foreign_keys.Rd new file mode 100644 index 0000000..9ae84a8 --- /dev/null +++ b/man/validate_foreign_keys.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/validate.R +\name{validate_foreign_keys} +\alias{validate_foreign_keys} +\title{Validate foreign key referential integrity across tables} +\usage{ +validate_foreign_keys(tables, schema, table_name) +} +\arguments{ +\item{tables}{named list of all loaded data frames} + +\item{schema}{the schema list for the child resource} + +\item{table_name}{name of the child table} +} +\value{ +character vector of error messages +} +\description{ +OPTIMIZED: Uses fast set membership checking instead of row-by-row comparison. +} +\keyword{internal} diff --git a/man/validate_frictionless_fields.Rd b/man/validate_frictionless_fields.Rd new file mode 100644 index 0000000..f217c96 --- /dev/null +++ b/man/validate_frictionless_fields.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/validate.R +\name{validate_frictionless_fields} +\alias{validate_frictionless_fields} +\title{Validate Frictionless-native field-level constraints from datapackage.json} +\usage{ +validate_frictionless_fields(df, schema, table_name) +} +\arguments{ +\item{df}{data frame to validate} + +\item{schema}{the "schema" list for this resource from datapackage.json} + +\item{table_name}{table name string for error messages} +} +\value{ +character vector of error messages (empty if all pass) +} +\description{ +Reads the schema from datapackage.json and checks each field's declared +constraints (required, minimum, maximum, enum, unique) against the +supplied data frame. Foreign key and primaryKey checks are handled +separately in validate_foreign_keys(). 
+} +\keyword{internal} diff --git a/man/validate_frictionless_layer.Rd b/man/validate_frictionless_layer.Rd new file mode 100644 index 0000000..84ae669 --- /dev/null +++ b/man/validate_frictionless_layer.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/validate.R +\name{validate_frictionless_layer} +\alias{validate_frictionless_layer} +\title{Run all Frictionless-layer validation across all loaded tables} +\usage{ +validate_frictionless_layer(tables, datapackage_path = "datapackage.json") +} +\arguments{ +\item{tables}{named list of data frames (ONLY constrained tables)} + +\item{datapackage_path}{path to datapackage.json} +} +\value{ +named list of character vectors; each name is a table with errors +} +\description{ +Run all Frictionless-layer validation across all loaded tables +} +\keyword{internal} diff --git a/man/validate_primary_key.Rd b/man/validate_primary_key.Rd new file mode 100644 index 0000000..bb4c9ff --- /dev/null +++ b/man/validate_primary_key.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/validate.R +\name{validate_primary_key} +\alias{validate_primary_key} +\title{Validate primaryKey uniqueness for a table} +\usage{ +validate_primary_key(df, schema, table_name) +} +\arguments{ +\item{df}{data frame} + +\item{schema}{the schema list for this resource} + +\item{table_name}{string for error messages} +} +\value{ +character vector of error messages +} +\description{ +Validate primaryKey uniqueness for a table +} +\keyword{internal} diff --git a/tests/testthat/test_constraints.R b/tests/testthat/test_constraints.R new file mode 100644 index 0000000..998bd93 --- /dev/null +++ b/tests/testthat/test_constraints.R @@ -0,0 +1,543 @@ +# tests/testthat/test-constraints.R +# +# Tests for BETYdb data constraint validators. 
+# Covers both Frictionless-layer checks (via validate_frictionless_fields, +# validate_primary_key) and custom-layer checks (via run_composite_constraints, +# run_custom_constraints). +# +# Tests follow the pattern: one test for a valid case (should produce zero +# errors), one or more tests for each violation type. + +# Source the validators directly so tests do not depend on package install state. +# In CI these files are at the repo root relative paths. + +# =========================================================================== +# SITES — composite constraints +# =========================================================================== + +test_that("sites: valid data produces no errors", { + df <- data.frame( + sitename = "Test Farm", + lat = 45.0, + lon = -93.0, + sand_pct = 40, + clay_pct = 30, + mat = 10, + map = 800, + stringsAsFactors = FALSE + ) + con <- list(list( + id = "soil_fraction_sum", + type = "sum_limit", + columns = c("sand_pct", "clay_pct"), + operator = "<=", + value = 100, + message = "sand_pct + clay_pct must not exceed 100" + )) + expect_length(run_composite_constraints(df, con), 0) +}) + +test_that("sites: sand_pct + clay_pct > 100 is caught", { + df <- data.frame(sitename = "Bad Site", sand_pct = 70, clay_pct = 50, + stringsAsFactors = FALSE) + con <- list(list( + id = "soil_fraction_sum", + type = "sum_limit", + columns = c("sand_pct", "clay_pct"), + operator = "<=", + value = 100, + message = "sand_pct + clay_pct must not exceed 100" + )) + errors <- run_composite_constraints(df, con) + expect_gt(length(errors), 0) + expect_true(any(grepl("soil_fraction_sum", errors))) +}) + +test_that("sites: sand_pct + clay_pct exactly 100 passes", { + df <- data.frame(sand_pct = 60, clay_pct = 40, stringsAsFactors = FALSE) + con <- list(list( + id = "soil_fraction_sum", type = "sum_limit", + columns = c("sand_pct", "clay_pct"), operator = "<=", value = 100, + message = "sand_pct + clay_pct must not exceed 100" + )) + 
expect_length(run_composite_constraints(df, con), 0) +}) + +test_that("sites: NA in sand_pct or clay_pct skips the row silently", { + df <- data.frame(sand_pct = NA_real_, clay_pct = 90, stringsAsFactors = FALSE) + con <- list(list( + id = "soil_fraction_sum", type = "sum_limit", + columns = c("sand_pct", "clay_pct"), operator = "<=", value = 100, + message = "sand_pct + clay_pct must not exceed 100" + )) + # Row with NA in any summed column is skipped; no error expected + expect_length(run_composite_constraints(df, con), 0) +}) + +# =========================================================================== +# SITES — Frictionless field constraints (lat, lon, mat, map, sitename) +# =========================================================================== + +test_that("sites: lat out of range [-90, 90] is caught", { + schema <- list(fields = list( + list(name = "lat", type = "number", + constraints = list(minimum = -90, maximum = 90)) + )) + df <- data.frame(lat = 200, stringsAsFactors = FALSE) + errors <- validate_frictionless_fields(df, schema, "sites") + expect_gt(length(errors), 0) + expect_true(any(grepl("lat", errors))) +}) + +test_that("sites: lon out of range [-180, 180] is caught", { + schema <- list(fields = list( + list(name = "lon", type = "number", + constraints = list(minimum = -180, maximum = 180)) + )) + df <- data.frame(lon = -200, stringsAsFactors = FALSE) + errors <- validate_frictionless_fields(df, schema, "sites") + expect_gt(length(errors), 0) + expect_true(any(grepl("lon", errors))) +}) + +test_that("sites: sitename required — NA value is caught", { + schema <- list(fields = list( + list(name = "sitename", type = "string", + constraints = list(required = TRUE)) + )) + df <- data.frame(sitename = NA_character_, stringsAsFactors = FALSE) + errors <- validate_frictionless_fields(df, schema, "sites") + expect_gt(length(errors), 0) + expect_true(any(grepl("sitename", errors))) +}) + +test_that("sites: mat out of range is caught", { + schema <- 
list(fields = list( + list(name = "mat", type = "number", + constraints = list(minimum = -25, maximum = 40)) + )) + df <- data.frame(mat = 100, stringsAsFactors = FALSE) + errors <- validate_frictionless_fields(df, schema, "sites") + expect_gt(length(errors), 0) +}) + +test_that("sites: valid lat/lon/sitename passes all Frictionless checks", { + schema <- list(fields = list( + list(name = "sitename", type = "string", + constraints = list(required = TRUE)), + list(name = "lat", type = "number", + constraints = list(minimum = -90, maximum = 90)), + list(name = "lon", type = "number", + constraints = list(minimum = -180, maximum = 180)) + )) + df <- data.frame(sitename = "Good Site", lat = 45, lon = -93, + stringsAsFactors = FALSE) + expect_length(validate_frictionless_fields(df, schema, "sites"), 0) +}) + +# =========================================================================== +# TRAITS — conditional stat/statname pairing +# =========================================================================== + +test_that("traits: stat without statname is caught", { + df <- data.frame(mean = 5.0, stat = 0.5, statname = NA_character_, + stringsAsFactors = FALSE) + con <- list(list( + id = "stat_requires_statname", + type = "conditional", + `if` = list(column = "stat", condition = "not_null"), + then = list(column = "statname", condition = "not_null_and_not_empty"), + message = "statname is required when stat is provided" + )) + errors <- run_composite_constraints(df, con) + expect_gt(length(errors), 0) + expect_true(any(grepl("stat_requires_statname", errors))) +}) + +test_that("traits: statname without stat is caught", { + df <- data.frame(mean = 5.0, stat = NA_real_, statname = "SE", + stringsAsFactors = FALSE) + con <- list(list( + id = "statname_requires_stat", + type = "conditional", + `if` = list(column = "statname", condition = "not_null_and_not_empty"), + then = list(column = "stat", condition = "not_null"), + message = "stat is required when statname is provided" 
+ )) + errors <- run_composite_constraints(df, con) + expect_gt(length(errors), 0) +}) + +test_that("traits: stat and statname both present passes", { + df <- data.frame(mean = 5.0, stat = 0.5, statname = "SE", + stringsAsFactors = FALSE) + con <- list(list( + id = "stat_requires_statname", + type = "conditional", + `if` = list(column = "stat", condition = "not_null"), + then = list(column = "statname", condition = "not_null_and_not_empty"), + message = "statname is required when stat is provided" + )) + expect_length(run_composite_constraints(df, con), 0) +}) + +test_that("traits: both stat and statname NA passes (neither required)", { + df <- data.frame(mean = 5.0, stat = NA_real_, statname = NA_character_, + stringsAsFactors = FALSE) + con <- list(list( + id = "stat_requires_statname", + type = "conditional", + `if` = list(column = "stat", condition = "not_null"), + then = list(column = "statname", condition = "not_null_and_not_empty"), + message = "statname is required when stat is provided" + )) + expect_length(run_composite_constraints(df, con), 0) +}) + +# TRAITS — Frictionless field constraints (mean required, access_level enum) + +test_that("traits: mean required — NA is caught", { + schema <- list(fields = list( + list(name = "mean", type = "number", constraints = list(required = TRUE)) + )) + df <- data.frame(mean = NA_real_, stringsAsFactors = FALSE) + errors <- validate_frictionless_fields(df, schema, "traits") + expect_gt(length(errors), 0) + expect_true(any(grepl("mean", errors))) +}) + +# =========================================================================== +# TRAITS — cross-table range check (trait mean within variable min/max) +# =========================================================================== + +test_that("traits: mean within variable range passes", { + traits <- data.frame(mean = 50, trait = "SLA", stringsAsFactors = FALSE) + variables <- data.frame(name = "SLA", min = "0", max = "100", stringsAsFactors = FALSE) + tables <- 
list(traitsview = traits, variables = variables)

  con <- list(list(
    id = "trait_mean_in_variable_range",
    type = "conditional_range",
    column = "mean",
    lookup = list(table = "variables", key_col = "name",
                  min_col = "min", max_col = "max"),
    join_on = "trait",
    null_means_no_limit = TRUE,
    message = "trait mean must fall within variable min/max"
  ))
  errors <- run_custom_constraints(tables, "traitsview", con)
  expect_length(errors, 0)
})

# Shared constraint spec for the traits/variables range tests below.
# Built by a function (not a top-level object) so each test gets a fresh
# copy and cannot be affected by accidental mutation in another test.
trait_range_constraint <- function() {
  list(list(
    id = "trait_mean_in_variable_range",
    type = "conditional_range",
    column = "mean",
    lookup = list(table = "variables", key_col = "name",
                  min_col = "min", max_col = "max"),
    join_on = "trait",
    null_means_no_limit = TRUE,
    message = "trait mean must fall within variable min/max"
  ))
}

test_that("traits: mean below variable min is caught", {
  traits <- data.frame(mean = -5, trait = "SLA", stringsAsFactors = FALSE)
  variables <- data.frame(name = "SLA", min = "0", max = "100",
                          stringsAsFactors = FALSE)
  tables <- list(traitsview = traits, variables = variables)

  errors <- run_custom_constraints(tables, "traitsview",
                                   trait_range_constraint())
  expect_gt(length(errors), 0)
  expect_true(any(grepl("trait_mean_in_variable_range", errors)))
})

test_that("traits: mean above variable max is caught", {
  traits <- data.frame(mean = 200, trait = "SLA", stringsAsFactors = FALSE)
  variables <- data.frame(name = "SLA", min = "0", max = "100",
                          stringsAsFactors = FALSE)
  tables <- list(traitsview = traits, variables = variables)

  errors <- run_custom_constraints(tables, "traitsview",
                                   trait_range_constraint())
  expect_gt(length(errors), 0)
})

test_that("traits: NULL variable min treated as -Infinity (no lower limit)", {
  traits <- data.frame(mean = -9999, trait = "SLA", stringsAsFactors = FALSE)
  variables <- data.frame(name = "SLA", min = NA_character_, max = "100",
                          stringsAsFactors = FALSE)
  tables <- list(traitsview = traits, variables = variables)

  # With null_means_no_limit = TRUE, NA min means no lower bound; should pass.
  errors <- run_custom_constraints(tables, "traitsview",
                                   trait_range_constraint())
  expect_length(errors, 0)
})

# ===========================================================================
# CULTIVARS — unique_combination constraint
# ===========================================================================

test_that("cultivars: duplicate name + specie_id is caught", {
  df <- data.frame(
    name = c("Alpha", "Alpha"),
    specie_id = c(1, 1),
    stringsAsFactors = FALSE
  )
  con <- list(list(
    id = "unique_name_per_species",
    type = "unique_combination",
    columns = c("name", "specie_id"),
    message = "Cultivar name must be unique within a species"
  ))
  errors <- run_composite_constraints(df, con)
  expect_gt(length(errors), 0)
  expect_true(any(grepl("unique_name_per_species", errors)))
})

test_that("cultivars: same name, different species passes", {
  df <- data.frame(
    name = c("Alpha", "Alpha"),
    specie_id = c(1, 2),
    stringsAsFactors = FALSE
  )
  con <- list(list(
    id = "unique_name_per_species",
    type = "unique_combination",
    columns = c("name", "specie_id"),
    message = "Cultivar name must be unique within a species"
  ))
  expect_length(run_composite_constraints(df, con), 0)
})

test_that("cultivars: required fields name and specie_id are checked", {
  schema <- list(fields = list(
    list(name = "name", type = "string",
         constraints = list(required = TRUE)),
    list(name = "specie_id", type = "number",
         constraints = list(required = TRUE))
  ))
  df <- data.frame(name = NA_character_, specie_id = 1,
                   stringsAsFactors = FALSE)
  errors <- validate_frictionless_fields(df, schema, "cultivars")
  expect_gt(length(errors), 0)
  expect_true(any(grepl("name", errors)))
})

# ===========================================================================
# PRIMARY KEY uniqueness
# ===========================================================================

test_that("primaryKey: duplicate id is caught", {
  schema <- list(primaryKey = list("id"))
  df <- data.frame(id = c(1, 1, 2), stringsAsFactors = FALSE)
  errors <- validate_primary_key(df, schema, "sites")
  expect_gt(length(errors), 0)
  expect_true(any(grepl("primaryKey", errors)))
})

test_that("primaryKey: unique ids pass", {
  schema <- list(primaryKey = list("id"))
  df <- data.frame(id = c(1, 2, 3), stringsAsFactors = FALSE)
  expect_length(validate_primary_key(df, schema, "sites"), 0)
})

test_that("primaryKey: compound key duplicate is caught", {
  schema <- list(primaryKey = list("pft_id", "specie_id"))
  df <- data.frame(
    pft_id = c(1, 1, 2),
    specie_id = c(10, 10, 10),
    stringsAsFactors = FALSE
  )
  errors <- validate_primary_key(df, schema, "pfts_species")
  expect_gt(length(errors), 0)
})

# ===========================================================================
# VALIDATE_ALL integration — stop_on_error = FALSE returns results
# ===========================================================================

test_that("validate_all returns empty list when no errors", {
  # Minimal valid tables with no violations.
  tables <- list(
    sites = data.frame(
      id = 1L, sitename = "Good Site", lat = 45, lon = -93,
      sand_pct = 30, clay_pct = 20,
      stringsAsFactors = FALSE
    )
  )
  # Use a constraints path that exists; the Frictionless layer is skipped
  # by pointing datapackage_path at a file that does not exist.
  constraints_path <- system.file("extdata", "custom_constraints.yaml",
                                  package = "betydata")
  skip_if_not(nzchar(constraints_path) && file.exists(constraints_path),
              "custom_constraints.yaml not installed; skipping integration test")

  result <- validate_all(
    tables = tables,
    datapackage_path = "nonexistent_datapackage.json",  # skip Frictionless layer
    constraints_path = constraints_path,
    stop_on_error = FALSE
  )
  expect_length(result, 0)
})

test_that("validate_all returns errors without stopping when stop_on_error = FALSE", {
  tables <- list(
    sites = data.frame(
      id = 1L, sitename = "Bad Site",
      sand_pct = 80, clay_pct = 80,  # sum > 100 — violation
      stringsAsFactors = FALSE
    )
  )
  constraints_path <- system.file("extdata", "custom_constraints.yaml",
                                  package = "betydata")
  skip_if_not(nzchar(constraints_path) && file.exists(constraints_path),
              "custom_constraints.yaml not installed; skipping integration test")

  result <- validate_all(
    tables = tables,
    datapackage_path = "nonexistent_datapackage.json",
    constraints_path = constraints_path,
    stop_on_error = FALSE
  )
  expect_gt(length(result), 0)
  expect_true("sites" %in% names(result))
})

test_that("validate_all stops when stop_on_error = TRUE and errors exist", {
  tables <- list(
    sites = data.frame(sitename = NA_character_, sand_pct = 80, clay_pct = 80)
  )
  constraints_path <- system.file("extdata", "custom_constraints.yaml",
                                  package = "betydata")
  # Guard matches the other integration tests: system.file() returns ""
  # when the file is not installed.
  skip_if_not(nzchar(constraints_path) && file.exists(constraints_path),
              "custom_constraints.yaml not installed; skipping integration test")
  expect_error(
    validate_all(tables, datapackage_path = "nonexistent.json",
                 constraints_path = constraints_path, stop_on_error = TRUE)
  )
})

test_that("custom_constraints.yaml loads without error", {
  skip_if_not_installed("yaml")  # yaml is in Suggests, not Imports
  path <- system.file("extdata", "custom_constraints.yaml",
                      package = "betydata")
  skip_if_not(nzchar(path) && file.exists(path),
              "custom_constraints.yaml not installed")
  rules <- yaml::read_yaml(path)
  expect_true("sites" %in% names(rules))
  expect_true("traits" %in% names(rules))
  expect_true("cultivars" %in% names(rules))
})

test_that("packaged sites data passes all constraints", {
  # Load into the test environment rather than the user's global env.
  data("sites", package = "betydata", envir = environment())

  # sitename is a required field.
  expect_false(any(is.na(sites$sitename)))

  # Composite rule: sand_pct + clay_pct must not exceed 100 when both present.
  both_present <- !is.na(sites$sand_pct) & !is.na(sites$clay_pct)
  if (any(both_present)) {
    sums <- sites$sand_pct[both_present] + sites$clay_pct[both_present]
    expect_true(all(sums <= 100))
  }

  # Mean annual temperature within plausible physical bounds.
  valid_mat <- sites$mat[!is.na(sites$mat)]
  if (length(valid_mat) > 0) {
    expect_true(all(valid_mat >= -25 & valid_mat <= 40))
  }

  # Mean annual precipitation within plausible physical bounds.
  valid_map <- sites$map[!is.na(sites$map)]
  if (length(valid_map) > 0) {
    expect_true(all(valid_map >= 0 & valid_map <= 12000))
  }
})

# ===========================================================================
# NEW CONSTRAINTS ADDED IN PR2
# ===========================================================================

test_that("citations: duplicate author-year-title is caught", {
  df <- data.frame(
    author = c("Smith", "Smith"),
    year = c(2020, 2020),
    title = c("Paper", "Paper"),
    stringsAsFactors = FALSE
  )
  con <- list(list(
    id = "unique_citation_identity",
    type = "unique_combination",
    columns = c("author", "year", "title"),
    message = "citation author-year-title combination must be unique"
  ))
  errors <- run_composite_constraints(df, con)
  expect_gt(length(errors), 0)
})

test_that("methods: duplicate name within citation is caught", {
  df <- data.frame(
    name = c("MethodA", "MethodA"),
    citation_id = c(1, 1),
    stringsAsFactors = FALSE
  )
  con <- list(list(
    id = "unique_method_per_citation",
    type = "unique_combination",
    columns = c("name", "citation_id"),
    message = "method name must be unique within citation"
  ))
  errors <- run_composite_constraints(df, con)
  expect_gt(length(errors), 0)
})

test_that("managements: level requires units is caught", {
  df <- data.frame(
    level = 5,
    units = NA_character_,
    stringsAsFactors = FALSE
  )
  con <- list(list(
    id = "level_requires_units",
    type = "conditional",
    # `if` is backquoted because it is a reserved word in R.
    `if` = list(
      column = "level",
      condition = "not_null"
    ),
    then = list(
      column = "units",
      condition = "not_null_and_not_empty"
    ),
    message = "units required when level is provided"
  ))
  errors <- run_composite_constraints(df, con)
  expect_gt(length(errors), 0)
})