---
title: "Merge Models"
subtitle: "Merge species distribution models and produce tabular summaries for score calculation"
format:
html:
code-fold: true
code-tools: true
editor_options:
chunk_output_type: console
---
## Overview (`v4b`: `is_turtle` = TRUE)
Merge species distribution models from multiple datasets, flag valid taxa
(`is_ok`), and produce tabular summaries as input to `calc_scores.qmd`.
Data preparation steps (taxonomic ID resolution, ad-hoc species fixes,
dataset ingestion) are in `merge_models_prep.qmd`.
**`v4b`**. For turtles, this workflow implements a "multiplicative merge" approach that integrates the SWOT+DPS ER surface with suitability surfaces from other datasets, while still allowing critical habitat cells to override via max. For non-turtles, the workflow implements a "max merge" approach that simply takes the maximum value across all datasets for each cell, with masking based on global mask priority when applicable. The max merge approach is unchanged from previous versions.
```{r}
#| label: setup
librarian::shelf(
DBI,
dplyr,
DT,
duckdb,
glue,
here,
knitr,
readr,
stringr,
tibble,
tidyr,
quiet = T
)
options(readr.show_col_types = F)
source(here("libs/paths.R"))
# database connections
con_sdm <- dbConnect(duckdb(dbdir = sdm_db, read_only = F))
# section eval flags (set to TRUE to re-run) ----
do_merge_turtles <- T # multiple R chunks to insert new merged dataset; iterate_merge_ds_mdl takes >1 hr
do_is_ok <- T # flag valid taxa
```
## Add Merged Dataset (`ms_merge`)
### Insert ms_merge dataset row
```{r}
#| label: insert_dataset_merged
#| eval: !expr do_merge_turtles
ds_key <- "ms_merge"
row_dataset <- tibble(
ds_key = !!ds_key,
name_short = glue("Marine Sensitivity Merged Model, {Sys.Date()}"),
name_original = "Marine Sensitivity merged model from multiple inputs",
description = "This dataset is derived from others. When a given taxon has multiple distributions, the maximum value is taken. If an IUCN range exists, then that and any critical habitat (NOAA or FWS) are used to mask the other inputs so that only areas within the IUCN range are considered.",
citation = "",
source_broad = "BOEM",
source_detail = "https://marinesensitivity.org/docs",
regions = "USA",
response_type = "mixed",
taxa_groups = "all taxa",
year_pub = 2025,
date_obs_beg = NA,
date_obs_end = NA,
date_env_beg = NA,
date_env_end = NA,
link_info = "https://github.com/MarineSensitivity",
link_download = NA,
link_metadata = NA,
links_other = NA,
spatial_res_deg = 0.05,
temporal_res = "static"
)
if (dbExistsTable(con_sdm, "dataset")) {
dbExecute(con_sdm, glue("DELETE FROM dataset WHERE ds_key = '{ds_key}'"))
}
dbWriteTable(con_sdm, "dataset", row_dataset, append = TRUE)
# tbl(con_sdm, "dataset")
```
### Dataset metadata
```{r}
#| label: dataset_metadata
#| eval: !expr do_merge_turtles
# add metadata columns to dataset table ----
for (col_def in list(
c("name_display", "VARCHAR"),
c("value_info", "VARCHAR"),
c("is_mask", "BOOLEAN"),
c("sort_order", "INTEGER"),
c("global_mask_priority", "DECIMAL")
)) {
tryCatch(
dbExecute(
con_sdm,
glue(
"ALTER TABLE dataset ADD COLUMN IF NOT EXISTS {col_def[1]} {col_def[2]}"
)
),
error = function(e) {
if (!grepl("already exists", e$message, ignore.case = TRUE)) {
stop(e)
}
}
)
}
# populate metadata for each dataset
metadata <- tribble(
~ds_key , ~name_display , ~value_info , ~is_mask , ~sort_order , ~global_mask_priority ,
"am_0.05" , "AquaMaps SDM" , NA , FALSE , 1L , NA ,
"ca_nmfs" , "NMFS Core Area" , "Core: 100" , TRUE , 2L , NA ,
"ch_nmfs" , "NMFS Critical Habitat" , "EN:100, TN:50" , TRUE , 3L , NA ,
"ch_fws" , "FWS Critical Habitat" , "EN:100, TN:50" , TRUE , 4L , NA ,
"rng_fws" , "FWS Range" , "EN:100, TN:50, LC:1" , TRUE , 5L , NA ,
"bl" , "BirdLife Range" , "CR:50, EN:25, VU:5, NT:2, LC:1, DD:1" , TRUE , 6L , 2.0 ,
"rng_iucn" , "IUCN Range" , "CR:50, EN:25, VU:5, NT:2, LC:1, DD:1" , TRUE , 7L , 2.0 ,
"rng_turtle_swot_dps" , "SWOT+DPS Turtle Range" , "EN:100, TN:50" , TRUE , 8L , 1.0 ,
"ms_merge" , "Merged Model" , NA , FALSE , 0L , NA
)
for (i in 1:nrow(metadata)) {
m <- metadata[i, ]
vi <- ifelse(is.na(m$value_info), "NULL", glue("'{m$value_info}'"))
gmp <- ifelse(is.na(m$global_mask_priority), "NULL", m$global_mask_priority)
dbExecute(
con_sdm,
glue(
"UPDATE dataset
SET name_display = '{m$name_display}',
value_info = {vi},
is_mask = {tolower(m$is_mask)},
sort_order = {m$sort_order},
global_mask_priority = {gmp}
WHERE ds_key = '{m$ds_key}'"
)
)
}
tbl(con_sdm, "dataset") |>
select(ds_key, name_display, is_mask, sort_order) |>
collect()
```
### Update turtle rng_iucn values
```{r}
#| label: update_turtle_rng_iucn
#| eval: !expr do_merge_turtles
# TODO: systematically reconcile rng_iucn values with latest IUCN Red List API
# assessments (rredlist R package). IUCN shapefile `category` can lag behind
# the current website assessment. Known mismatches beyond turtles include:
# - Chinook Salmon: shapefile LC(1) vs latest IUCN assessment TBD
# - Killer Whale: shapefile DD(1) vs latest IUCN assessment TBD
# - Beluga Whale: shapefile LC(1) vs latest IUCN assessment TBD
# - Humpback Whale: shapefile LC(1) vs latest IUCN assessment TBD
# - Steller Sea Lion: shapefile NT(2) vs latest IUCN assessment TBD
# - False Killer Whale: shapefile NT(2) vs latest IUCN assessment TBD
# - Bearded Seal: shapefile NT(2) vs latest IUCN assessment TBD
# - Boulder Star Coral: shapefile NT(2) vs latest IUCN assessment TBD
# - Dolly Varden: shapefile LC(1) vs latest IUCN assessment TBD
# is_turtle: override rng_iucn cell values with latest IUCN website assessments
# for v4c, generalize this to all species using IUCN API
d_turtle_iucn <- tribble(
~scientific_name , ~iucn_code ,
"Chelonia mydas" , "LC" , # iucnredlist.org/species/4615/285108125
"Eretmochelys imbricata" , "CR" , # iucnredlist.org/species/8005/12881238
"Lepidochelys kempii" , "CR" , # iucnredlist.org/species/11533/155057916
"Dermochelys coriacea" , "VU" , # iucnredlist.org/species/6494/43526147
"Caretta caretta" , "VU" , # iucnredlist.org/species/3897/119333622
"Lepidochelys olivacea" , "VU"
) # iucnredlist.org/species/11534/3292503
# get rng_iucn mdl_seq for each turtle taxon
d_turtle_mdl <- tbl(con_sdm, "taxon") |>
filter(sp_cat == "turtle") |>
select(taxon_id, scientific_name) |>
inner_join(
tbl(con_sdm, "taxon_model") |>
filter(ds_key == "rng_iucn"),
by = "taxon_id"
) |>
collect() |>
inner_join(d_turtle_iucn, by = "scientific_name") |>
mutate(
value = msens::compute_er_score(glue("IUCN:{iucn_code}"))
)
# update rng_iucn cell values per turtle species
for (j in 1:nrow(d_turtle_mdl)) {
r <- d_turtle_mdl[j, ]
n <- dbExecute(
con_sdm,
glue(
"UPDATE model_cell SET value = {r$value}
WHERE mdl_seq = {r$mdl_seq}"
)
)
message(glue(
" updated {r$scientific_name} rng_iucn (mdl_seq={r$mdl_seq}): ",
"IUCN:{r$iucn_code} -> value={r$value} ({n} cells)"
))
}
```
### Iterate merge across taxa
```{r}
#| label: iterate_merge_ds_mdl
#| eval: !expr do_merge_turtles
ds_key <- "ms_merge"
# cell_tif <- glue("{dir_data}/derived/r_bio-oracle_planarea.tif")
# r_cell <- rast(cell_tif)
# ext(r_cell) <- round(ext(r_cell), 3)
d_x <- tbl(con_sdm, "taxon") |>
filter(is_ok) |>
filter(sp_cat == "turtle") |>
arrange(desc(n_ds), taxon_id) |>
collect()
# removed: quick fix for turtle sp_cat — now handled by reclassify_reptiles
# chunk after taxon table creation (bind_birds_notbirds_ds)
# table(d_x$sp_cat, useNA = "ifany")
# OLD:
# bird coral fish invertebrate mammal reptile
# 49 10 12 1 12 7
# NEW (rng_iucn):
# coral fish invertebrate mammal other turtle
# 375 927 148 58 2 6
ds_keys <- tbl(con_sdm, "dataset") |>
pull(ds_key) |>
setdiff("ms_merge")
# datasets for extracting max value SDM
# ds_keys_sdm <- setdiff(ds_keys, "rng_iucn") # OLD: excluding IUCN range map except for masking
ds_keys_sdm <- ds_keys # NEW: include IUCN range map for max value SDM
# datasets that form the combined mask (IUCN range + critical habitats + national range), only when global mask exists for species
ds_keys_mask <- tbl(con_sdm, "dataset") |>
filter(is_mask) |>
pull(ds_key)
# datasets with global_mask_priority (lower = higher priority)
ds_global_masks <- tbl(con_sdm, "dataset") |>
filter(!is.na(global_mask_priority)) |>
select(ds_key, global_mask_priority) |>
collect() |>
arrange(global_mask_priority)
t_0 <- Sys.time()
for (i in 1:nrow(d_x)) {
# i = 1 # which(str_detect(d_x$scientific_name, ".*ricei")) # i = 1512
d_sp <- d_x |> slice(i)
is_turtle <- d_sp$sp_cat == "turtle"
# d_sp |> glimpse()
# Rows: 1
# Columns: 18
# $ taxon_id <dbl> 127186
# $ taxon_authority <chr> "worms"
# $ n_ds <int> 5
# $ am_0.05 <int> 7466
# $ ch_nmfs <int> 18230
# $ ch_fws <int> 18309
# $ rng_fws <int> 18401
# $ sp_cat <chr> "fish"
# $ bl <int> NA
# $ mdl_seq <int> NA
# $ scientific_name <chr> "Salmo salar"
# $ common_name <chr> "silver salmon"
# $ redlist_code <chr> "EN"
# $ worms_is_marine <lgl> TRUE
# $ worms_is_extinct <lgl> NA
# $ worms_id <dbl> 127186
# $ is_ok <lgl> TRUE
# $ rng_iucn <int> 19445
# list species models by dataset from taxon_model junction table
d_sp_l <- tbl(con_sdm, "taxon_model") |>
filter(taxon_id == !!d_sp$taxon_id) |>
collect() |>
mutate(
taxon_authority = d_sp$taxon_authority
)
# taxon_id taxon_authority ds_key mdl_seq
# <dbl> <chr> <chr> <int>
# 1 127186 worms am_0.05 7466
# 2 127186 worms ch_fws 18309
# 3 127186 worms ch_nmfs 18230
# 4 127186 worms rng_fws 18401
# 5 127186 worms rng_iucn 19445
# NEW ds_key: "ms_merge"; mdl_seq: 20030
d_r_ds <- tbl(con_sdm, "model_cell") |>
filter(
mdl_seq %in% d_sp_l$mdl_seq
) |>
left_join(
tbl(con_sdm, "model") |>
select(mdl_seq, ds_key) |>
filter(mdl_seq %in% d_sp_l$mdl_seq),
by = "mdl_seq"
) |>
group_by(ds_key) |>
summarize(
n_cell = n(),
v_min = min(value, na.rm = T),
v_max = max(value, na.rm = T),
.groups = "drop"
) |>
collect() |>
mutate(
ds_str = glue("{ds_key} ({n_cell} cells, {v_min} - {v_max})")
)
# ds_key n_cell v_min v_max ds_str
# <chr> <dbl> <int> <int> <glue>
# 1 rng_iucn 6245 50 50 rng_iucn (6245 cells, 50 - 50)
# 2 ch_fws 29 90 90 ch_fws (29 cells, 90 - 90)
# 3 am_0.05 11739 1 100 am_0.05 (11739 cells, 1 - 100)
# 4 ch_nmfs 3 90 90 ch_nmfs (3 cells, 90 - 90)
# 5 rng_fws 264 50 90 rng_fws (264 cells, 50 - 90)
if (is_turtle) {
# turtle multiplicative merge (v4b) ----
# merged_value = pmax(1, round(er_value * suit_value / 100))
# critical habitat cells override via max
# suitability surface: non-mask SDM datasets (e.g., am_0.05)
ds_suit_sp <- intersect(d_sp_l$ds_key, setdiff(ds_keys_sdm, ds_keys_mask))
# ER surface: SWOT+DPS
ds_er_sp <- intersect(d_sp_l$ds_key, "rng_turtle_swot_dps")
# critical habitat datasets
ds_ch_sp <- intersect(d_sp_l$ds_key, c("ch_nmfs", "ch_fws", "ca_nmfs"))
# all datasets used (for description)
ds_sdm_sp <- unique(c(ds_suit_sp, ds_er_sp, ds_ch_sp))
if (length(ds_er_sp) > 0) {
er_mdl_seqs <- d_sp_l |>
filter(ds_key %in% ds_er_sp) |>
pull(mdl_seq)
d_er <- tbl(con_sdm, "model_cell") |>
filter(mdl_seq %in% er_mdl_seqs) |>
select(cell_id, er_value = value) |>
collect()
if (length(ds_suit_sp) > 0) {
suit_mdl_seqs <- d_sp_l |>
filter(ds_key %in% ds_suit_sp) |>
pull(mdl_seq)
d_suit <- tbl(con_sdm, "model_cell") |>
filter(mdl_seq %in% suit_mdl_seqs) |>
group_by(cell_id) |>
summarize(suit_value = max(value, na.rm = T), .groups = "drop") |>
collect()
d_r_sp <- d_er |>
left_join(d_suit, by = "cell_id") |>
mutate(
suit_value = coalesce(suit_value, 1L),
value = pmax(
1L,
as.integer(round(
er_value * suit_value / 100
))
)
) |>
select(cell_id, value)
} else {
# no suitability: suit=1 everywhere
d_r_sp <- d_er |>
mutate(
value = pmax(1L, as.integer(round(er_value * 1L / 100)))
) |>
select(cell_id, value)
}
} else {
# no SWOT+DPS (unlikely): fall back to max of all SDMs
sdm_mdl_seqs <- d_sp_l |>
filter(ds_key %in% intersect(d_sp_l$ds_key, ds_keys_sdm)) |>
pull(mdl_seq)
d_r_sp <- tbl(con_sdm, "model_cell") |>
filter(mdl_seq %in% sdm_mdl_seqs) |>
group_by(cell_id) |>
summarize(value = max(value, na.rm = T), .groups = "drop") |>
collect()
}
# union critical habitat cells (override via max)
if (length(ds_ch_sp) > 0) {
ch_mdl_seqs <- d_sp_l |>
filter(ds_key %in% ds_ch_sp) |>
pull(mdl_seq)
d_ch <- tbl(con_sdm, "model_cell") |>
filter(mdl_seq %in% ch_mdl_seqs) |>
group_by(cell_id) |>
summarize(value = max(value, na.rm = T), .groups = "drop") |>
collect()
d_r_sp <- bind_rows(d_r_sp, d_ch) |>
group_by(cell_id) |>
summarize(value = max(value, na.rm = T), .groups = "drop")
}
has_global_mask <- FALSE
has_other_sdm <- FALSE
} else {
# non-turtle max merge (v4 approach, unchanged) ----
# datasets used for SDM
ds_sdm_sp <- intersect(d_sp_l$ds_key, ds_keys_sdm)
sdm_mdl_seqs <- d_sp_l |>
filter(ds_key %in% ds_sdm_sp) |>
pull(mdl_seq)
# query of SDM with max value across all datasets
q_sdm <- tbl(con_sdm, "model_cell") |>
filter(
mdl_seq %in% sdm_mdl_seqs
) |>
group_by(cell_id) |>
summarize(value = max(value, na.rm = T), .groups = "drop")
# find highest-priority global mask this species has
sp_global_masks <- d_sp_l |>
inner_join(ds_global_masks, by = "ds_key") |>
arrange(global_mask_priority)
has_global_mask <- nrow(sp_global_masks) > 0
# only mask if species has at least one SDM dataset besides the global mask
# (avoid self-intersection, e.g. birds with only BirdLife)
has_other_sdm <- FALSE
if (has_global_mask) {
best_mask_ds <- sp_global_masks$ds_key[1]
other_sdm_ds <- setdiff(ds_sdm_sp, best_mask_ds)
has_other_sdm <- length(other_sdm_ds) > 0
}
if (has_global_mask && has_other_sdm) {
# get datasets used for mask
# exclude lower-priority global masks from the mask union
lower_global_masks <- sp_global_masks |>
filter(ds_key != best_mask_ds) |>
pull(ds_key)
ds_mask_sp <- setdiff(
intersect(d_sp_l$ds_key, ds_keys_mask),
lower_global_masks
)
# get mdl_seqs for masking datasets
mask_mdl_seqs <- d_sp_l |>
filter(ds_key %in% ds_mask_sp) |>
pull(mdl_seq)
q_mask <- tbl(con_sdm, "model_cell") |>
filter(mdl_seq %in% mask_mdl_seqs) |>
distinct(cell_id) |>
mutate(value = 1)
# get SDM cells with mask applied
d_r_sp <- q_sdm |>
semi_join(
q_mask,
by = join_by(cell_id)
) |>
collect()
} else {
# no global mask or no other SDM to mask — just max value of all SDMs
d_r_sp <- collect(q_sdm)
}
}
# apply MMPA/MBTA spatial minimum floors ----
if (isTRUE(d_sp$is_mmpa)) {
d_r_sp <- d_r_sp |>
mutate(value = pmax(value, 20L))
}
if (isTRUE(d_sp$is_mbta)) {
d_r_sp <- d_r_sp |>
mutate(value = pmax(value, 10L))
}
sp_sci <- d_sp$scientific_name
sp_cmn <- d_sp$common_name
sp_cat <- d_sp$sp_cat
sp_key <- glue("{ds_key}:{d_sp$taxon_id}")
# show message every 50 iterations or for the first 5 iterations
if (i <= 5 || i %% 50 == 0) {
eta <- Sys.time() + round((Sys.time() - t_0) / i * (nrow(d_x) - i))
message(glue(
"Processing {i}/{nrow(d_x)} [{sp_key}] ({sp_cat}): {sp_cmn} (_{sp_sci}_) ~ ETA: {eta}"
))
# started ~15:00
}
n_cells <- nrow(d_r_sp)
if (n_cells == 0) {
message(glue("No cells for {sp_sci} ({i}/{nrow(d_x)})"))
next()
}
# delete: existing ----
mdl_seqs <- tbl(con_sdm, "model") |>
filter(ds_key == !!ds_key, taxa == !!sp_key) |>
pull(mdl_seq)
if (length(mdl_seqs) > 0) {
dbExecute(
con_sdm,
glue(
"DELETE FROM taxon_model WHERE ds_key = '{ds_key}' AND taxon_id = {d_sp$taxon_id}"
)
)
dbExecute(
con_sdm,
glue("DELETE FROM model WHERE ds_key = '{ds_key}' AND taxa = '{sp_key}'")
)
dbExecute(
con_sdm,
glue(
"DELETE FROM species WHERE ds_key = '{ds_key}' AND taxa = '{sp_key}'"
)
)
dbExecute(
con_sdm,
glue(
"DELETE FROM model_cell WHERE mdl_seq IN ({paste(mdl_seqs, collapse = ',')})"
)
)
dbExecute(
con_sdm,
glue(
"UPDATE taxon SET mdl_seq = NULL WHERE mdl_seq IN ({paste(mdl_seqs, collapse = ',')})"
)
)
}
# append: model ----
has_mask_str <- if (has_global_mask && has_other_sdm) {
glue(
"; Mask applied (global mask: {best_mask_ds}) from combining datasets: {paste(ds_mask_sp, collapse = ', ')}"
)
} else {
""
}
d_model <- tibble(
ds_key = ds_key,
taxa = sp_key,
time_period = "2025",
region = "USA",
mdl_type = "mixed",
description = if (is_turtle) {
glue(
"Marine Sensitivity merged model with multiplicative merge ",
"(er * suit / 100) from datasets: {paste(ds_sdm_sp, collapse = ', ')}"
)
} else {
glue(
"Marine Sensitivity merged model with max values from datasets: ",
"{paste(ds_sdm_sp, collapse = ', ')}{has_mask_str}"
)
}
)
dbWriteTable(con_sdm, "model", d_model, append = TRUE)
# tbl(con_sdm, "model") |> collect() |> tail()
# get the mdl_seq that was just created
mdl_seq <- dbGetQuery(
con_sdm,
glue(
"
SELECT mdl_seq FROM model
WHERE
ds_key = '{ds_key}' AND
taxa = '{sp_key}'
ORDER BY mdl_seq DESC LIMIT 1 "
)
)$mdl_seq
# d_sp |> glimpse()
# Rows: 1
# Columns: 15
# $ taxon_id <dbl> 127186
# $ taxon_authority <chr> "worms"
# $ n_ds <int> 4
# $ am_0.05 <int> 7466
# $ ch_nmfs <int> 18230
# $ ch_fws <int> 18309
# $ rng_fws <int> 18401
# $ sp_cat <chr> "fish"
# $ bl <int> NA
# $ mdl_seq <int> NA
# $ scientific_name <chr> "Salmo salar"
# $ common_name <chr> "silver salmon"
# $ redlist_code <chr> "EN"
# $ worms_is_marine <lgl> TRUE
# $ worms_is_extinct <lgl> NA
# append: species ----
d_species <- tibble(
ds_key = ds_key,
taxa = sp_key,
sp_key = sp_key,
worms_id = ifelse(
d_sp$taxon_authority == "worms",
d_sp$taxon_id,
NA_integer_
),
botw_id = ifelse(
d_sp$taxon_authority == "botw",
d_sp$taxon_id,
NA_integer_
),
gbif_id = NA_integer_,
itis_id = NA_integer_,
iucn_id = NA_integer_,
scientific_name_dataset = sp_sci,
common_name_dataset = sp_cmn,
scientific_name_accepted = sp_sci,
common_name_accepted = sp_cmn,
redlist_code = d_sp$redlist_code,
redlist_year = NA_integer_,
sp_cat = sp_cat,
worms_is_marine = d_sp$worms_is_marine,
worms_is_extinct = d_sp$worms_is_extinct
)
# d_species |> glimpse()
stopifnot(
length(setdiff(names(d_species), dbListFields(con_sdm, "species"))) == 0
)
stopifnot(
setdiff(dbListFields(con_sdm, "species"), names(d_species)) == "sp_seq"
)
dbWriteTable(con_sdm, "species", d_species, append = T)
# tbl(con_sdm, "species") |> collect() |> tail()
# tbl(con_sdm, "model") |> collect() |> tail()
stopifnot(
tbl(con_sdm, "species") |> filter(is.na(sp_seq)) |> collect() |> nrow() == 0
)
# append: model_cell ----
d_mdl_cell <- d_r_sp |>
mutate(
mdl_seq = mdl_seq
) |>
arrange(cell_id)
dbWriteTable(con_sdm, "model_cell", d_mdl_cell, append = T)
# update taxon with mdl_seq
dbExecute(
con_sdm,
glue(
"
UPDATE taxon
SET mdl_seq = {mdl_seq}
WHERE taxon_id = {d_sp$taxon_id}"
)
)
# append ms_merge row to taxon_model junction table
dbWriteTable(
con_sdm,
"taxon_model",
tibble(
taxon_id = d_sp$taxon_id,
ds_key = "ms_merge",
mdl_seq = as.integer(mdl_seq)
),
append = TRUE
)
}
# Salmo salar silver salmon
```
## Set taxon.is_ok
Add `is_ok`: a simple logical field for flagging valid taxa, ie (so far):
- birds:
- [x] add worms_id (if available)
- `redlist_code` != "EX"
- has a `botw_id`
- if has worms_id:
- `worms_is_marine` != F
- `worms_is_extinct` != T
- no model cells overlap with Program Areas
- category is "reptile" but not "turtle" (handled by reclassify_reptiles)
- not birds:
- has a `worms_id`
- `worms_is_marine` != F
- `worms_is_extinct` != T
### Flag valid taxa
```{r}
#| label: taxon_is_ok
#| eval: !expr do_is_ok
d <- tbl(con_sdm, "taxon") |>
collect() # 17,561 × 16
# birds ----
# d |>
# filter(
# taxon_authority == "botw") |>
# pull(redlist_code) |>
# table(useNA = "ifany")
# CR DD EN LC NT TN VU
# 3 1 58 452 45 14 41
#
# d |>
# filter(
# taxon_authority == "botw") |>
# select(worms_is_marine, worms_is_extinct) |>
# table(useNA = "ifany")
# worms_is_extinct
# worms_is_marine <NA>
# FALSE 118
# TRUE 221
# <NA> 275
d_b <- d |>
filter(
taxon_authority == "botw"
) |>
mutate(
is_ok = case_when(
is.na(taxon_id) ~ F,
is.na(mdl_seq) ~ F,
!is.na(redlist_code) & redlist_code == "EX" ~ F, # 0
!is.na(worms_id) & worms_is_marine == F ~ F, # 118
!is.na(worms_id) & worms_is_extinct == T ~ F, # 0
.default = T
)
)
# d_b$is_ok |> table(useNA = "ifany")
# FALSE TRUE
# 118 496
# worms ----
# d |>
# filter(
# taxon_authority == "worms") |>
# pull(redlist_code) |>
# table(useNA = "ifany")
# CR DD EN EX LC NT TN VU <NA>
# 47 340 235 1 5574 107 11 154 10478
#
# d |>
# filter(
# taxon_authority == "worms") |>
# select(worms_is_marine, worms_is_extinct) |>
# table(useNA = "ifany")
# worms_is_extinct
# worms_is_marine FALSE TRUE <NA>
# FALSE 19 0 24
# TRUE 4171 20 12708
# <NA> 0 0 5
d_w <- d |>
filter(
taxon_authority == "worms"
) |>
mutate(
is_ok = case_when(
is.na(taxon_id) ~ F,
is.na(mdl_seq) ~ F,
!is.na(redlist_code) & redlist_code == "EX" ~ F,
!is.na(worms_id) & worms_is_marine == F ~ F,
!is.na(worms_id) & worms_is_extinct == T ~ F,
!is.na(worms_taxonomic_status) &
!worms_taxonomic_status %in%
c("accepted", "alternative representation") ~ F,
sp_cat == "reptile" ~ F,
sp_cat == "turtle" ~ T,
.default = T
)
)
# d_w$is_ok |> table(useNA = "ifany")
# FALSE TRUE
# 64 16,883
d2 <- bind_rows(
d_b,
d_w
) |>
select(taxon_id, is_ok)
# flag taxa with no distribution inside program areas ----
# (mirrors r_mask approach in apps_2026/mapsp/app.R but with DB queries)
taxa_in_pra <- tbl(con_sdm, "taxon_model") |>
inner_join(
tbl(con_sdm, "model_cell") |> select(mdl_seq, cell_id),
by = "mdl_seq"
) |>
inner_join(
tbl(con_sdm, "zone_cell") |> select(zone_seq, cell_id),
by = "cell_id"
) |>
inner_join(
tbl(con_sdm, "zone") |>
filter(
fld == "programarea_key"
) |>
select(zone_seq),
by = "zone_seq"
) |>
distinct(taxon_id) |>
pull(taxon_id)
n_outside <- sum(d2$is_ok & !(d2$taxon_id %in% taxa_in_pra))
message(glue(
"{n_outside} taxa flagged is_ok=F (no distribution in program areas)"
))
d2 <- d2 |>
mutate(
is_ok = is_ok & taxon_id %in% taxa_in_pra
)
stopifnot(sum(duplicated(d2$taxon_id)) == 0)
dbExecute(
con_sdm,
"ALTER TABLE taxon ADD COLUMN IF NOT EXISTS is_ok BOOLEAN"
)
duckdb_register(con_sdm, "d2", d2)
dbExecute(
con_sdm,
"UPDATE taxon
SET is_ok = d2.is_ok
FROM d2
WHERE taxon.taxon_id = d2.taxon_id"
) # 17,561
duckdb_unregister(con_sdm, "d2")
# add is_er_spatial flag (TRUE for turtles where ER is spatially ----
# differentiated per-cell from mask datasets; FALSE for species using
# uniform species-level scalar er_score)
dbExecute(
con_sdm,
"ALTER TABLE taxon ADD COLUMN IF NOT EXISTS is_er_spatial BOOLEAN"
)
dbExecute(
con_sdm,
"UPDATE taxon SET is_er_spatial = (sp_cat = 'turtle')"
)
```
## Taxon Summary
### Export taxon summary
```{r}
#| label: taxon_summary
taxon_csv <- here("data/taxon.csv")
# dbListFields(con_sdm, "taxon") |> paste(collapse = ", ") |> cat()
d_taxon <- tbl(con_sdm, "taxon") |>
select(
is_ok,
component = sp_cat,
common_name,
scientific_name,
redlist_code_max = redlist_code,
extrisk_code,
er_score,
is_mmpa,
is_mbta,
is_bcc,
worms_is_marine,
worms_is_extinct,
n_datasets = n_ds,
taxon_authority,
taxon_id,
model_id = mdl_seq
) |>
arrange(desc(is_ok), component, common_name) |>
collect()
write_csv(d_taxon, taxon_csv)
```
## Dataset Summary
```{r}
#| label: dataset_overview
tbl(con_sdm, "dataset") |>
select(sort_order, ds_key, name_display, value_info, is_mask) |>
arrange(sort_order) |>
collect() |>
DT::datatable()
```
```{r}
#| label: model_counts_by_ds
tbl(con_sdm, "model") |>
group_by(ds_key) |>
summarize(n_models = n()) |>
arrange(ds_key) |>
collect() |>
DT::datatable()
```
## Species Summary
### Species counts by sp_cat and ds_key
```{r}
#| label: spp_counts_by_cat_ds
tbl(con_sdm, "species") |>
group_by(sp_cat, ds_key) |>
summarize(n = n(), .groups = "drop") |>
arrange(sp_cat, ds_key) |>
collect() |>
DT::datatable()
```
### Taxonomic authority coverage
```{r}
#| label: auth_coverage
tbl(con_sdm, "species") |>
group_by(sp_cat, ds_key) |>
summarize(
n = n(),
n_worms = sum(!is.na(worms_id), na.rm = T),
n_itis = sum(!is.na(itis_id), na.rm = T),
n_gbif = sum(!is.na(gbif_id), na.rm = T),
n_allna = sum(
is.na(worms_id) & is.na(itis_id) & is.na(gbif_id),
na.rm = T
),
.groups = "drop"
) |>
mutate(
pct_worms = round(n_worms / n * 100, 1),
pct_itis = round(n_itis / n * 100, 1),
pct_gbif = round(n_gbif / n * 100, 1),
pct_allna = round(n_allna / n * 100, 1)
) |>
arrange(sp_cat, ds_key) |>
collect() |>
DT::datatable()
```
### WoRMS marine/extinct percentages
```{r}
#| label: worms_marine_extinct
tbl(con_sdm, "species") |>
group_by(sp_cat) |>
summarize(
n = n(),
n_worms_marine = sum(worms_is_marine, na.rm = T),
n_worms_extinct = sum(worms_is_extinct, na.rm = T)
) |>
mutate(
pct_worms_marine = round(n_worms_marine / n * 100, 1),
pct_worms_extinct = round(n_worms_extinct / n * 100, 1)
) |>
collect() |>
DT::datatable()
```
## Taxon Table Summary
### Taxon counts by sp_cat and is_ok
```{r}
#| label: taxon_counts_by_cat_ok
tbl(con_sdm, "taxon") |>
group_by(sp_cat, is_ok) |>
summarize(n = n(), .groups = "drop") |>
collect() |>
pivot_wider(
names_from = is_ok,
values_from = n,
names_prefix = "is_ok_"
) |>
DT::datatable()
```
### Redlist code distribution
```{r}
#| label: redlist_distribution
tbl(con_sdm, "taxon") |>
filter(is_ok) |>
group_by(sp_cat, redlist_code) |>
summarize(n = n(), .groups = "drop") |>
collect() |>
pivot_wider(
names_from = redlist_code,
values_from = n,
values_fill = 0
) |>
DT::datatable()
```
### Extinction risk authority summary
```{r}
#| label: extrisk_authority
tbl(con_sdm, "taxon") |>
filter(is_ok) |>
mutate(
authority = case_when(
str_starts(extrisk_code, "NMFS") ~ "NMFS",
str_starts(extrisk_code, "FWS") ~ "FWS",
str_starts(extrisk_code, "IUCN") ~ "IUCN",
TRUE ~ "none"
)
) |>
count(authority) |>
collect() |>
DT::datatable()
```
### Number of datasets per taxon
```{r}
#| label: n_ds_distribution
tbl(con_sdm, "taxon") |>
filter(is_ok) |>
count(n_ds) |>
collect() |>
DT::datatable()
```
## Taxon x Dataset Matrix
```{r}
#| label: taxon_dataset_matrix
tbl(con_sdm, "taxon_model") |>
inner_join(
tbl(con_sdm, "taxon") |>
filter(is_ok) |>
select(taxon_id, sp_cat),
by = "taxon_id"
) |>
group_by(sp_cat, ds_key) |>
summarize(n = n(), .groups = "drop") |>
collect() |>
pivot_wider(
names_from = ds_key,
values_from = n,
values_fill = 0
) |>
DT::datatable()
```
## Taxon Detail Table
```{r}
#| label: taxon_detail
tbl(con_sdm, "taxon") |>
filter(is_ok) |>
select(
sp_cat,
scientific_name,
common_name,
redlist_code,
extrisk_code,
er_score,
n_ds,
taxon_id,
taxon_authority
) |>
arrange(sp_cat, scientific_name) |>
collect() |>
DT::datatable(
filter = "top",
options = list(pageLength = 25)
)
```