---
title: "Audit IUCN Range Map Geometry Issues"
subtitle: "Identify missing rng_iucn species and diagnose curved geometry failures"
format:
html:
code-fold: true
code-tools: true
editor_options:
chunk_output_type: console
---
## Overview
The IUCN shapefile ingestion (`merge_models_prep.qmd`) skips species whose
geometries remain invalid after `st_make_valid()`. Some IUCN shapefiles contain
**curved geometry types** (COMPOUNDCURVE, CIRCULARSTRING, CURVEPOLYGON,
MULTISURFACE) that require special handling — the same issue solved for BirdLife
data in `ingest_birdlife.org_botw.qmd` via WKT text replacement.
This notebook:
1. Reads each IUCN shapefile and inspects geometry types
2. Compares species in shapefiles against rng_iucn entries in the database
3. Tests `fix_curved_geom()` on the shapefiles that contain curved geometries
4. Produces a summary of all missing species with diagnosed reasons
```{r}
#| label: setup
#| warning: false
# attach the packages used throughout this notebook
librarian::shelf(
  DBI,
  dplyr,
  duckdb,
  DT,
  fs,
  glue,
  here,
  purrr,
  readr,
  sf,
  stringr,
  tibble,
  tidyr,
  quiet = TRUE
)
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)

# project paths; presumably defines `sdm_db` and `dir_data` used below
source(here("libs/paths.R"))

# read-only connection: this audit only queries the database
con_sdm <- dbConnect(duckdb(dbdir = sdm_db, read_only = TRUE))
d_taxon <- tbl(con_sdm, "taxon") |> collect()

dir_iucn_raw <- glue("{dir_data}/raw/iucnredlist.org")
dir_iucn_derived <- glue("{dir_data}/derived/iucnredlist.org")

# all IUCN range shapefiles, searched recursively under the raw directory
rng_shps <- list.files(
  dir_iucn_raw,
  pattern = "\\.shp$",
  recursive = TRUE,
  full.names = TRUE
)

# fail early with a clear message: list.files() silently returns character(0)
# for a missing or empty directory, which would otherwise only surface as an
# obscure error in a later chunk
if (length(rng_shps) == 0) {
  stop(
    "no IUCN shapefiles (*.shp) found under: ",
    dir_iucn_raw,
    call. = FALSE
  )
}

# turn off s2 so sf does not use spherical geometry operations
sf_use_s2(FALSE)
```
## 1. Shapefile Geometry Inventory
Read each IUCN shapefile and catalog the geometry types per species.
```{r}
#| label: read_shapefiles
# helper: fix curved geometries (from ingest_birdlife.org_botw.qmd surf2poly)
#
# obj: an sf object.
# returns: `obj` untouched when none of its geometries is of a curved type;
#   otherwise an sf object whose geometries were re-parsed from WKT with the
#   curve wrapper keywords removed, cast to MULTIPOLYGON and passed through
#   st_make_valid(). rows that re-parse as LINESTRING are appended after the
#   other rows, so row order can differ from the input.
fix_curved_geom <- function(obj) {
  curve_classes <- c(
    "MULTISURFACE",
    "COMPOUNDCURVE",
    "CURVEPOLYGON",
    "CIRCULARSTRING"
  )
  present <- unique(as.character(st_geometry_type(st_geometry(obj))))

  # nothing to do when no curved geometry type is present
  if (!any(curve_classes %in% present)) {
    return(obj)
  }

  # drop the curve wrapper keywords from the WKT text, one wrapper at a time
  wkt <- sf::st_as_text(sf::st_geometry(obj))
  for (wrapper in c("MULTISURFACE (", "COMPOUNDCURVE (", "CURVEPOLYGON (")) {
    wkt <- gsub(pattern = wrapper, replacement = "", x = wkt, fixed = TRUE)
  }

  # re-parse the edited WKT, keeping the CRS of the input
  fixed <- sf::st_set_geometry(
    obj,
    value = sf::st_as_sfc(wkt, crs = sf::st_crs(obj))
  )

  # rows that came back as LINESTRING are cast to POLYGON, then MULTIPOLYGON,
  # and re-attached after the remaining rows
  line_rows <- which(sf::st_geometry_type(fixed) == "LINESTRING")
  if (length(line_rows) > 0) {
    polys_from_lines <- fixed |>
      dplyr::slice(line_rows) |>
      sf::st_cast(to = "POLYGON") |>
      sf::st_cast(to = "MULTIPOLYGON")
    fixed <- rbind(fixed[-line_rows, ], polys_from_lines)
  }

  sf::st_make_valid(sf::st_cast(fixed, "MULTIPOLYGON"))
}
# inventory geometry types per shapefile and species
#
# for each shapefile in `rng_shps`, record one row per feature with the group
# name (file name without ".shp"), `sci_name`, the optional `marine` / `id_no`
# attributes and the feature's geometry type. shapefiles that cannot be read
# or that lack a `sci_name` field are skipped with a message.
#
# per-file results are collected in a list and bound once at the end, instead
# of re-copying a growing tibble on every iteration.
inventory_parts <- vector("list", length(rng_shps))
for (i in seq_along(rng_shps)) {
  rng_shp <- rng_shps[i]
  grp <- basename(rng_shp) |> str_replace_all("\\.shp$", "")
  message(glue("reading {grp} ({i}/{length(rng_shps)})..."))

  # report why a file could not be read rather than discarding the error
  a <- tryCatch(
    read_sf(rng_shp),
    error = function(e) {
      message(glue(" ERROR: could not read {grp}: {conditionMessage(e)}"))
      NULL
    }
  )
  if (is.null(a)) {
    next
  }

  # check for required fields
  if (!"sci_name" %in% names(a)) {
    message(glue(" SKIP: no sci_name field in {grp}"))
    next
  }

  inventory_parts[[i]] <- a |>
    st_drop_geometry() |>
    mutate(
      # geometry type per row, taken from the sf object before dropping geometry
      geom_type = as.character(st_geometry_type(a)),
      grp = grp
    ) |>
    select(grp, sci_name, any_of(c("marine", "id_no")), geom_type) |>
    # `marine` is only ever compared as lower-cased text downstream; storing it
    # as character keeps bind_rows() from failing when shapefiles disagree on
    # its type
    mutate(across(any_of("marine"), as.character))
}

# skipped files leave NULL entries, which bind_rows() ignores. the typed empty
# template guarantees the columns used by later chunks exist even when every
# shapefile was skipped or none carries a `marine` field.
d_geom_inventory <- bind_rows(
  tibble(
    grp = character(),
    sci_name = character(),
    marine = character(),
    geom_type = character()
  ),
  inventory_parts
) |>
  select(grp, sci_name, any_of(c("marine", "id_no")), geom_type)
# collapse the inventory to one row per species within each shapefile group:
# number of rows, the distinct geometry types seen, and whether any of them is
# a curved type
d_geom_summary <- summarise(
  group_by(d_geom_inventory, grp, sci_name),
  n_rows = n(),
  geom_types = geom_type |> unique() |> sort() |> paste(collapse = ", "),
  has_curved = any(
    geom_type %in%
      c("MULTISURFACE", "COMPOUNDCURVE", "CURVEPOLYGON", "CIRCULARSTRING")
  ),
  .groups = "drop"
)

# keep only the species that have at least one curved geometry
d_curved <- filter(d_geom_summary, has_curved)

datatable(
  d_curved,
  caption = glue(
    "{nrow(d_curved)} species with curved geometries across all IUCN shapefiles"
  ),
  filter = "top",
  options = list(dom = "ft", pageLength = 50)
)
```
### All geometry types found
```{r}
#| label: geom_types_summary
# number of features per shapefile group and geometry type
datatable(
  d_geom_inventory |>
    count(grp, geom_type, name = "n_rows") |>
    arrange(grp, geom_type),
  caption = "geometry types per shapefile group",
  filter = "top",
  options = list(dom = "ft", pageLength = 50)
)
```
## 2. Compare Shapefiles to Database
Identify species present in shapefiles (after marine filter + taxon name match)
but missing from the rng_iucn dataset in the database.
```{r}
#| label: compare_to_db
# rng_iucn rows of the taxon_model table, with the scientific name attached
d_rng_iucn_db <- left_join(
  tbl(con_sdm, "taxon_model") |>
    filter(ds_key == "rng_iucn") |>
    collect(),
  select(d_taxon, taxon_id, scientific_name),
  by = "taxon_id"
)

# shapefile rows flagged marine ("true"/"t", case-insensitive) whose sci_name
# matches a scientific_name in the taxon table; reduced to distinct
# group / species / taxon / curved-flag combinations
d_shp_marine <- d_geom_inventory |>
  filter(str_to_lower(marine) %in% c("true", "t")) |>
  inner_join(
    select(d_taxon, taxon_id, scientific_name),
    by = c("sci_name" = "scientific_name")
  ) |>
  distinct(
    grp,
    sci_name,
    taxon_id,
    has_curved = geom_type %in%
      c("MULTISURFACE", "COMPOUNDCURVE", "CURVEPOLYGON", "CIRCULARSTRING")
  )

# one row per species: curved if any of its rows is curved (a species can mix
# geometry types across rows)
d_shp_spp <- summarise(
  group_by(d_shp_marine, grp, sci_name, taxon_id),
  has_curved = any(has_curved),
  .groups = "drop"
)
# species present in the shapefiles but without an rng_iucn entry in the
# database, annotated with the geometry types recorded for them
d_missing <- d_shp_spp |>
  anti_join(d_rng_iucn_db, by = "taxon_id") |>
  left_join(
    select(d_geom_summary, grp, sci_name, geom_types),
    by = c("grp", "sci_name")
  ) |>
  arrange(grp, sci_name)

# rows NOT flagged marine whose species is in the taxon table and still has no
# rng_iucn entry in the database
d_marine_mismatch <- d_geom_inventory |>
  filter(!(str_to_lower(marine) %in% c("true", "t"))) |>
  inner_join(
    select(d_taxon, taxon_id, scientific_name),
    by = c("sci_name" = "scientific_name")
  ) |>
  anti_join(d_rng_iucn_db, by = "taxon_id") |>
  distinct(grp, sci_name, taxon_id, marine)

datatable(
  d_missing,
  caption = glue(
    "{nrow(d_missing)} species in IUCN shapefiles but missing from rng_iucn database"
  ),
  filter = "top",
  options = list(dom = "ft", pageLength = 50)
)
```
### Species excluded by marine filter
```{r}
#| label: marine_filter_excluded
# species matched in the taxon table that the marine filter leaves out
datatable(
  data = d_marine_mismatch,
  filter = "top",
  options = list(dom = "ft", pageLength = 50),
  caption = "species in taxon table but excluded by marine filter (marine != 'true'/'t')"
)
```
## 3. Test Curved Geometry Fix
Apply `fix_curved_geom()` to shapefiles with curved geometries and verify the
missing species become valid.
```{r}
#| label: test_fix
# only test shapefiles that have curved species
grps_with_curved <- d_curved |> distinct(grp) |> pull(grp)

# group name of every shapefile, derived the same way as in the inventory, so
# groups are matched exactly rather than through an unescaped, unanchored regex
# (a group name that is a suffix of another file name would otherwise select
# both files and make read_sf() fail)
rng_grps <- basename(rng_shps) |> str_replace_all("\\.shp$", "")

# one tibble per tested shapefile, bound once after the loop
fix_results <- list()
for (grp in grps_with_curved) {
  # the same file name can occur in more than one sub-directory; every matching
  # file is tested and gets its own result row
  for (rng_shp in rng_shps[rng_grps == grp]) {
    message(glue("testing fix for {grp}..."))
    a <- read_sf(rng_shp)

    # validity before the fix. st_is_valid() can return NA for geometries it
    # cannot evaluate; those are counted as invalid so the count is never NA
    valid_before <- st_is_valid(a)
    n_invalid_before <- sum(is.na(valid_before) | !valid_before)

    # apply fix; an error is reported and recorded, not propagated
    a_fixed <- tryCatch(
      fix_curved_geom(a),
      error = function(e) {
        message(glue(" fix_curved_geom() error: {conditionMessage(e)}"))
        NULL
      }
    )

    if (is.null(a_fixed)) {
      fix_results[[length(fix_results) + 1]] <- tibble(
        grp = grp,
        n_before = nrow(a),
        n_invalid_before = n_invalid_before,
        fix_error = TRUE,
        n_after = NA_integer_,
        n_invalid_after = NA_integer_
      )
      next
    }

    # one more st_make_valid() pass when anything is still invalid or unknown
    # (NA-safe: a plain `any(!valid)` is NA, and `if (NA)` is an error)
    valid_after <- st_is_valid(a_fixed)
    if (any(is.na(valid_after) | !valid_after)) {
      a_fixed <- st_make_valid(a_fixed)
      valid_after <- st_is_valid(a_fixed)
    }

    fix_results[[length(fix_results) + 1]] <- tibble(
      grp = grp,
      n_before = nrow(a),
      n_invalid_before = n_invalid_before,
      fix_error = FALSE,
      n_after = nrow(a_fixed),
      n_invalid_after = sum(is.na(valid_after) | !valid_after)
    )
  }
}
d_fix_results <- bind_rows(fix_results)

datatable(
  d_fix_results,
  caption = "results of fix_curved_geom() on shapefiles with curved geometries",
  filter = "top",
  options = list(dom = "ft")
)
```
### Verify target species
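Check whether the target species *Phoca vitulina* and *Sotalia guianensis* are still in the missing list; if they are not, show their existing `rng_iucn` entries in the database.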
```{r}
#| label: verify_targets
# species this audit was started for
target_spp <- c("Phoca vitulina", "Sotalia guianensis")
d_target_status <- filter(d_missing, sci_name %in% target_spp)

if (nrow(d_target_status) == 0) {
  # not missing: show whatever rng_iucn rows the database holds for them
  cat(
    "target species NOT in missing list — they may already be in the database\n"
  )
  print(filter(d_rng_iucn_db, scientific_name %in% target_spp))
} else {
  cat("target species found in missing list:\n")
  print(d_target_status)
}
```
## 4. Summary
```{r}
#| label: summary
# label every missing species with a reason: curved geometry when flagged,
# otherwise (including a missing flag) left for investigation
d_missing_classified <- mutate(
  d_missing,
  reason = if_else(
    has_curved,
    "curved geometry (fixable)",
    "unknown — investigate",
    missing = "unknown — investigate"
  )
)

# plain-text summary of the counts gathered in the chunks above
glue(
  "## missing species summary\n\n",
  "- total species in IUCN shapefiles (marine + taxon match): {nrow(d_shp_spp)}\n",
  "- species with rng_iucn in database: {nrow(d_rng_iucn_db)}\n",
  "- **missing from database: {nrow(d_missing)}**\n",
  " - curved geometry (fixable): {sum(d_missing$has_curved)}\n",
  " - other/unknown: {sum(!d_missing$has_curved)}\n",
  "- excluded by marine filter: {nrow(d_marine_mismatch)}\n\n"
) |>
  cat()

datatable(
  d_missing_classified,
  caption = "all missing species with classified reason",
  filter = "top",
  options = list(dom = "ft", pageLength = 100)
)
```
```{r}
#| label: cleanup
#| include: false
# close the database connection opened in the setup chunk
DBI::dbDisconnect(con_sdm)
```