Audit IUCN Range Map Geometry Issues

Identify missing rng_iucn species and diagnose curved geometry failures

Published

2026-03-24 18:16:35

1 Overview

The IUCN shapefile ingestion (merge_models_prep.qmd) skips species whose geometries remain invalid after st_make_valid(). Some IUCN shapefiles contain curved geometry types (COMPOUNDCURVE, CIRCULARSTRING, CURVEPOLYGON, MULTISURFACE) that require special handling — the same issue solved for BirdLife data in ingest_birdlife.org_botw.qmd via WKT text replacement.

This notebook:

  1. Reads each IUCN shapefile and inspects geometry types
  2. Compares species in shapefiles against rng_iucn entries in the database
  3. Tests fix_curved_geom() on the shapefiles that contain curved geometries
  4. Produces a summary of all missing species with diagnosed reasons
Code
# attach (installing if needed) the packages used throughout this notebook
librarian::shelf(
  DBI,
  dplyr,
  duckdb,
  DT,
  fs,
  glue,
  here,
  purrr,
  readr,
  sf,
  stringr,
  tibble,
  tidyr,
  quiet = TRUE # spelled out: `T` is an ordinary variable that can be reassigned
)

knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)

# project paths; expected to define `sdm_db` and `dir_data` used below
source(here("libs/paths.R"))

# read-only connection: this notebook only audits, it never writes
con_sdm <- dbConnect(duckdb(dbdir = sdm_db, read_only = TRUE))

# full taxon table pulled into memory for the name joins further down
d_taxon <- tbl(con_sdm, "taxon") |> collect()

dir_iucn_raw <- glue("{dir_data}/raw/iucnredlist.org")
dir_iucn_derived <- glue("{dir_data}/derived/iucnredlist.org")

# every *.shp below the raw IUCN directory, searched recursively
rng_shps <- list.files(
  dir_iucn_raw,
  pattern = "\\.shp$",
  recursive = TRUE,
  full.names = TRUE
)

# turn off s2 spherical geometry for the sf operations in this notebook
sf_use_s2(FALSE)

2 1. Shapefile Geometry Inventory

Read each IUCN shapefile and catalog the geometry types per species.

Code
# helper: fix curved geometries (from ingest_birdlife.org_botw.qmd surf2poly)
#
# Rewrites curved geometry types as MULTIPOLYGON by editing their WKT text.
#
# obj: an sf object.
# returns: `obj` unchanged when none of the curved types is present; otherwise
#   an sf object whose geometry was rebuilt from the edited WKT, cast to
#   MULTIPOLYGON and passed through st_make_valid(). Rows that come back as
#   LINESTRING are moved to the end of the result.
fix_curved_geom <- function(obj) {
  curved_types <- c(
    "MULTISURFACE",
    "COMPOUNDCURVE",
    "CURVEPOLYGON",
    "CIRCULARSTRING"
  )
  types_present <- unique(as.character(st_geometry_type(st_geometry(obj))))

  # guard clause: nothing to rewrite
  if (!any(curved_types %in% types_present)) {
    return(obj)
  }

  # strip the curved-type wrappers from the WKT, in this fixed order
  # NOTE(review): CIRCULARSTRING is detected above but has no wrapper removal
  wkt <- sf::st_as_text(sf::st_geometry(obj))
  for (wrapper in c("MULTISURFACE (", "COMPOUNDCURVE (", "CURVEPOLYGON (")) {
    wkt <- gsub(pattern = wrapper, replacement = "", x = wkt, fixed = TRUE)
  }

  # rebuild the geometry column from the edited text, keeping the input CRS
  geom_new <- sf::st_as_sfc(wkt, crs = sf::st_crs(obj))
  out <- sf::st_set_geometry(obj, value = geom_new)

  # rings that were parsed as plain lines are cast to polygons separately
  idx <- which(sf::st_geometry_type(out) == "LINESTRING")
  if (length(idx) > 0) {
    lines_as_poly <- sf::st_cast(
      sf::st_cast(dplyr::slice(out, idx), to = "POLYGON"),
      to = "MULTIPOLYGON"
    )
    out <- rbind(out[-idx, ], lines_as_poly)
  }

  sf::st_make_valid(sf::st_cast(out, "MULTIPOLYGON"))
}

# inventory geometry types per shapefile and species
#
# One tibble per shapefile is stored in a preallocated list and bound once at
# the end, instead of re-copying a growing tibble on every iteration.
# Shapefiles that cannot be read or lack `sci_name` leave a NULL slot, which
# bind_rows() ignores.
l_geom_inventory <- vector("list", length(rng_shps))

for (i in seq_along(rng_shps)) {
  rng_shp <- rng_shps[i]
  grp <- basename(rng_shp) |> str_replace_all("\\.shp$", "")

  message(glue("reading {grp} ({i}/{length(rng_shps)})..."))

  # report why a shapefile failed to read rather than discarding the reason
  a <- tryCatch(
    read_sf(rng_shp),
    error = function(e) {
      message(glue("  ERROR: could not read {grp}: {conditionMessage(e)}"))
      NULL
    }
  )
  if (is.null(a)) {
    next
  }

  # check for required fields
  if (!"sci_name" %in% names(a)) {
    message(glue("  SKIP: no sci_name field in {grp}"))
    next
  }

  # get geometry type per row
  geom_types <- st_geometry_type(a) |> as.character()

  l_geom_inventory[[i]] <- a |>
    st_drop_geometry() |>
    mutate(
      geom_type = geom_types,
      grp = grp,
      # later chunks compare `marine` as text; normalising here also keeps
      # bind_rows() from failing on mixed column types across shapefiles
      across(any_of("marine"), as.character)
    ) |>
    select(grp, sci_name, any_of(c("marine", "id_no")), geom_type)
}

d_geom_inventory <- bind_rows(l_geom_inventory)

# one row per (group, species): row count, the distinct geometry types seen,
# and whether any of them is a curved type
d_geom_summary <- summarise(
  group_by(d_geom_inventory, grp, sci_name),
  n_rows = n(),
  geom_types = geom_type |> unique() |> sort() |> paste(collapse = ", "),
  has_curved = any(
    geom_type %in%
      c(
        "MULTISURFACE",
        "COMPOUNDCURVE",
        "CURVEPOLYGON",
        "CIRCULARSTRING"
      )
  ),
  .groups = "drop"
)

# keep only the species flagged as having a curved geometry
d_curved <- filter(d_geom_summary, has_curved)

d_curved |>
  datatable(
    caption = glue(
      "{nrow(d_curved)} species with curved geometries across all IUCN shapefiles"
    ),
    filter = "top",
    options = list(dom = "ft", pageLength = 50)
  )

2.1 All geometry types found

Code
# number of shapefile rows per (group, geometry type), shown as a table
datatable(
  arrange(
    count(d_geom_inventory, grp, geom_type, name = "n_rows"),
    grp,
    geom_type
  ),
  caption = "geometry types per shapefile group",
  filter = "top",
  options = list(dom = "ft", pageLength = 50)
)

3 2. Compare Shapefiles to Database

Identify species present in shapefiles (after marine filter + taxon name match) but missing from the rng_iucn dataset in the database.

Code
# taxon_model rows for the rng_iucn dataset, with scientific names attached
d_rng_iucn_db <- tbl(con_sdm, "taxon_model") |>
  filter(ds_key == "rng_iucn") |>
  collect() |>
  left_join(select(d_taxon, taxon_id, scientific_name), by = "taxon_id")

# shapefile rows flagged marine ("true"/"t", case-insensitive) whose sci_name
# matches a taxon; reduced to distinct (group, species, taxon, curved flag)
d_shp_marine <- d_geom_inventory |>
  filter(str_to_lower(marine) %in% c("true", "t")) |>
  inner_join(
    select(d_taxon, taxon_id, scientific_name),
    by = c("sci_name" = "scientific_name")
  ) |>
  mutate(
    has_curved = geom_type %in%
      c(
        "MULTISURFACE",
        "COMPOUNDCURVE",
        "CURVEPOLYGON",
        "CIRCULARSTRING"
      )
  ) |>
  distinct(grp, sci_name, taxon_id, has_curved)

# collapse to one row per species: curved if any of its rows is curved
d_shp_spp <- summarise(
  group_by(d_shp_marine, grp, sci_name, taxon_id),
  has_curved = any(has_curved),
  .groups = "drop"
)

# species present in the shapefiles whose taxon_id has no rng_iucn row,
# annotated with the geometry types recorded for them
d_missing <- d_shp_spp |>
  anti_join(d_rng_iucn_db, by = "taxon_id") |>
  left_join(
    select(d_geom_summary, grp, sci_name, geom_types),
    by = c("grp", "sci_name")
  ) |>
  arrange(grp, sci_name)

# rows NOT flagged marine (including a missing flag) that still match a taxon
# and have no rng_iucn row
d_marine_mismatch <- d_geom_inventory |>
  filter(!str_to_lower(marine) %in% c("true", "t")) |>
  inner_join(
    select(d_taxon, taxon_id, scientific_name),
    by = c("sci_name" = "scientific_name")
  ) |>
  anti_join(d_rng_iucn_db, by = "taxon_id") |>
  distinct(grp, sci_name, taxon_id, marine)

d_missing |>
  datatable(
    caption = glue(
      "{nrow(d_missing)} species in IUCN shapefiles but missing from rng_iucn database"
    ),
    filter = "top",
    options = list(dom = "ft", pageLength = 50)
  )

3.1 Species excluded by marine filter

Code
# table of taxon-matched species that the marine filter dropped
d_marine_mismatch |>
  datatable(
    caption = "species in taxon table but excluded by marine filter (marine != 'true'/'t')",
    filter = "top",
    options = list(dom = "ft", pageLength = 50)
  )

4 3. Test Curved Geometry Fix

Apply fix_curved_geom() to each shapefile that contains curved geometries and count how many geometries are invalid before and after the fix.

Code
# only test shapefiles that have curved species
grps_with_curved <- d_curved |> distinct(grp) |> pull(grp)

# one result row per group, bound once after the loop; groups that are
# skipped leave a NULL slot, which bind_rows() ignores
l_fix_results <- vector("list", length(grps_with_curved))

for (i_grp in seq_along(grps_with_curved)) {
  grp <- grps_with_curved[i_grp]

  # match on the exact file name: a regex suffix match would also pick up
  # files whose name merely ends with `grp` and would interpret any regex
  # metacharacters contained in the group name
  rng_shp <- rng_shps[basename(rng_shps) == paste0(grp, ".shp")]
  if (length(rng_shp) == 0) {
    next
  }
  if (length(rng_shp) > 1) {
    message(glue(
      "  WARNING: {length(rng_shp)} shapefiles named {grp}.shp; testing the first"
    ))
    rng_shp <- rng_shp[1]
  }

  message(glue("testing fix for {grp}..."))
  a <- read_sf(rng_shp)

  # check validity before fix; st_is_valid() can return NA for geometries it
  # cannot evaluate, so anything that is not TRUE is counted as invalid
  valid_before <- st_is_valid(a)
  n_invalid_before <- sum(!(valid_before %in% TRUE))

  # apply fix
  a_fixed <- tryCatch(
    fix_curved_geom(a),
    error = function(e) {
      message(glue("  fix_curved_geom() error: {e$message}"))
      NULL
    }
  )

  if (is.null(a_fixed)) {
    l_fix_results[[i_grp]] <- tibble(
      grp = grp,
      n_before = nrow(a),
      n_invalid_before = n_invalid_before,
      fix_error = TRUE,
      n_after = NA_integer_,
      n_invalid_after = NA_integer_
    )
    next
  }

  # apply st_make_valid on the fixed result; the NA-safe test avoids
  # `if (NA)` failing when validity could not be determined for some rows
  if (!all(st_is_valid(a_fixed) %in% TRUE)) {
    a_fixed <- st_make_valid(a_fixed)
  }

  valid_after <- st_is_valid(a_fixed)

  l_fix_results[[i_grp]] <- tibble(
    grp = grp,
    n_before = nrow(a),
    n_invalid_before = n_invalid_before,
    fix_error = FALSE,
    n_after = nrow(a_fixed),
    n_invalid_after = sum(!(valid_after %in% TRUE))
  )
}

d_fix_results <- bind_rows(l_fix_results)

datatable(
  d_fix_results,
  caption = "results of fix_curved_geom() on shapefiles with curved geometries",
  filter = "top",
  options = list(dom = "ft")
)

4.1 Verify target species

Check whether Phoca vitulina and Sotalia guianensis are among the missing species or are already present in the database.

Code
target_spp <- c("Phoca vitulina", "Sotalia guianensis")

d_target_status <- d_missing |>
  filter(sci_name %in% target_spp)

if (nrow(d_target_status) > 0) {
  cat("target species found in missing list:\n")
  print(d_target_status)
}

# Per-species diagnosis. A species absent from the missing list is not
# necessarily in the database: it may never have reached the comparison
# because it is in no readable shapefile, has no taxon name match, or was
# dropped by the marine filter. Each stage is checked so that every target
# gets an explicit status.
d_target_check <- tibble(sci_name = target_spp) |>
  mutate(
    in_shapefile = sci_name %in% d_geom_inventory$sci_name,
    in_taxon = sci_name %in% d_taxon$scientific_name,
    in_shp_spp = sci_name %in% d_shp_spp$sci_name,
    in_database = sci_name %in% d_rng_iucn_db$scientific_name,
    in_missing = sci_name %in% d_target_status$sci_name,
    status = case_when(
      in_missing ~ "in shapefiles but missing from rng_iucn database",
      in_database ~ "already in rng_iucn database",
      !in_shapefile ~ "not found in any readable shapefile",
      !in_taxon ~ "no scientific_name match in taxon table",
      !in_shp_spp ~ "excluded by marine filter",
      TRUE ~ "unknown — investigate"
    )
  )

cat("target species status:\n")
print(d_target_check)
target species NOT in missing list — they may already be in the database
# A tibble: 0 × 4
# ℹ 4 variables: taxon_id <dbl>, ds_key <chr>, mdl_seq <int>,
#   scientific_name <chr>

5 4. Summary

Code
# label each missing species: curved rows are tagged as fixable, everything
# else (including an undetermined flag) is tagged for investigation
d_missing_classified <- mutate(
  d_missing,
  reason = if_else(
    has_curved,
    "curved geometry (fixable)",
    "unknown — investigate",
    missing = "unknown — investigate"
  )
)

# plain-text summary of the counts derived above
glue(
  "## missing species summary\n\n",
  "- total species in IUCN shapefiles (marine + taxon match): {nrow(d_shp_spp)}\n",
  "- species with rng_iucn in database: {nrow(d_rng_iucn_db)}\n",
  "- **missing from database: {nrow(d_missing)}**\n",
  "  - curved geometry (fixable): {sum(d_missing$has_curved)}\n",
  "  - other/unknown: {sum(!d_missing$has_curved)}\n",
  "- excluded by marine filter: {nrow(d_marine_mismatch)}\n\n"
) |>
  cat()
## missing species summary

- total species in IUCN shapefiles (marine + taxon match): 2249
- species with rng_iucn in database: 1516
- **missing from database: 733**
  - curved geometry (fixable): 0
  - other/unknown: 733
- excluded by marine filter: 1
Code
# full list of missing species with the reason assigned to each
d_missing_classified |>
  datatable(
    caption = "all missing species with classified reason",
    filter = "top",
    options = list(dom = "ft", pageLength = 100)
  )