Merge species distribution models and produce tabular summaries for score calculation

Published

2026-03-17 16:01:04

1 Overview (v4b: is_turtle = TRUE)

Merge species distribution models from multiple datasets, flag valid taxa (is_ok), and produce tabular summaries as input to calc_scores.qmd.

Data preparation steps (taxonomic ID resolution, ad-hoc species fixes, dataset ingestion) are in merge_models_prep.qmd.

v4b. For turtles, this workflow implements a “multiplicative merge” approach that integrates the SWOT+DPS ER surface with suitability surfaces from other datasets, while still allowing critical habitat cells to override via max. For non-turtles, the workflow implements a “max merge” approach that simply takes the maximum value across all datasets for each cell, with masking based on global mask priority when applicable. The max merge approach is unchanged from previous versions.

Code
librarian::shelf(
  DBI,
  dplyr,
  DT,
  duckdb,
  glue,
  here,
  knitr,
  readr,
  stringr,
  tibble,
  tidyr,
  quiet = T
)
options(readr.show_col_types = F)

source(here("libs/paths.R"))

# database connections
con_sdm <- dbConnect(duckdb(dbdir = sdm_db, read_only = F))

# section eval flags (set to TRUE to re-run) ----
do_merge_turtles <- T # multiple R chunks to insert new merged dataset; iterate_merge_ds_mdl takes >1 hr
do_is_ok <- T # flag valid taxa

2 Add Merged Dataset (ms_merge)

2.1 Insert ms_merge dataset row

Code
ds_key <- "ms_merge"
row_dataset <- tibble(
  ds_key = !!ds_key,
  name_short = glue("Marine Sensitivity Merged Model, {Sys.Date()}"),
  name_original = "Marine Sensitivity merged model from multiple inputs",
  description = "This dataset is derived from others. When a given taxon has multiple distributions, the maximum value is taken. If an IUCN range exists, then that and any critical habitat (NOAA or FWS) are used to mask the other inputs so that only areas within the IUCN range are considered.",
  citation = "",
  source_broad = "BOEM",
  source_detail = "https://marinesensitivity.org/docs",
  regions = "USA",
  response_type = "mixed",
  taxa_groups = "all taxa",
  year_pub = 2025,
  date_obs_beg = NA,
  date_obs_end = NA,
  date_env_beg = NA,
  date_env_end = NA,
  link_info = "https://github.com/MarineSensitivity",
  link_download = NA,
  link_metadata = NA,
  links_other = NA,
  spatial_res_deg = 0.05,
  temporal_res = "static"
)

if (dbExistsTable(con_sdm, "dataset")) {
  dbExecute(con_sdm, glue("DELETE FROM dataset WHERE ds_key = '{ds_key}'"))
}
[1] 1
Code
dbWriteTable(con_sdm, "dataset", row_dataset, append = TRUE)
# tbl(con_sdm, "dataset")

2.2 Dataset metadata

Code
# add metadata columns to dataset table ----
for (col_def in list(
  c("name_display", "VARCHAR"),
  c("value_info", "VARCHAR"),
  c("is_mask", "BOOLEAN"),
  c("sort_order", "INTEGER"),
  c("global_mask_priority", "DECIMAL")
)) {
  tryCatch(
    dbExecute(
      con_sdm,
      glue(
        "ALTER TABLE dataset ADD COLUMN IF NOT EXISTS {col_def[1]} {col_def[2]}"
      )
    ),
    error = function(e) {
      if (!grepl("already exists", e$message, ignore.case = TRUE)) {
        stop(e)
      }
    }
  )
}

# populate metadata for each dataset
metadata <- tribble(
  ~ds_key               , ~name_display           , ~value_info                            , ~is_mask , ~sort_order , ~global_mask_priority ,
  "am_0.05"             , "AquaMaps SDM"          , NA                                     , FALSE    , 1L          , NA                    ,
  "ca_nmfs"             , "NMFS Core Area"        , "Core: 100"                            , TRUE     , 2L          , NA                    ,
  "ch_nmfs"             , "NMFS Critical Habitat" , "EN:100, TN:50"                        , TRUE     , 3L          , NA                    ,
  "ch_fws"              , "FWS Critical Habitat"  , "EN:100, TN:50"                        , TRUE     , 4L          , NA                    ,
  "rng_fws"             , "FWS Range"             , "EN:100, TN:50, LC:1"                  , TRUE     , 5L          , NA                    ,
  "bl"                  , "BirdLife Range"        , "CR:50, EN:25, VU:5, NT:2, LC:1, DD:1" , TRUE     , 6L          , 2.0                   ,
  "rng_iucn"            , "IUCN Range"            , "CR:50, EN:25, VU:5, NT:2, LC:1, DD:1" , TRUE     , 7L          , 2.0                   ,
  "rng_turtle_swot_dps" , "SWOT+DPS Turtle Range" , "EN:100, TN:50"                        , TRUE     , 8L          , 1.0                   ,
  "ms_merge"            , "Merged Model"          , NA                                     , FALSE    , 0L          , NA
)

for (i in 1:nrow(metadata)) {
  m <- metadata[i, ]
  vi <- ifelse(is.na(m$value_info), "NULL", glue("'{m$value_info}'"))
  gmp <- ifelse(is.na(m$global_mask_priority), "NULL", m$global_mask_priority)
  dbExecute(
    con_sdm,
    glue(
      "UPDATE dataset
       SET name_display         = '{m$name_display}',
           value_info           = {vi},
           is_mask              = {tolower(m$is_mask)},
           sort_order           = {m$sort_order},
           global_mask_priority = {gmp}
       WHERE ds_key = '{m$ds_key}'"
    )
  )
}

tbl(con_sdm, "dataset") |>
  select(ds_key, name_display, is_mask, sort_order) |>
  collect()
# A tibble: 9 × 4
  ds_key              name_display          is_mask sort_order
  <chr>               <chr>                 <lgl>        <int>
1 am_0.05             AquaMaps SDM          FALSE            1
2 bl                  BirdLife Range        TRUE             6
3 ca_nmfs             NMFS Core Area        TRUE             2
4 ch_fws              FWS Critical Habitat  TRUE             4
5 ch_nmfs             NMFS Critical Habitat TRUE             3
6 rng_fws             FWS Range             TRUE             5
7 rng_iucn            IUCN Range            TRUE             7
8 rng_turtle_swot_dps SWOT+DPS Turtle Range TRUE             8
9 ms_merge            Merged Model          FALSE            0

2.3 Update turtle rng_iucn values

Code
# TODO: systematically reconcile rng_iucn values with latest IUCN Red List API
#   assessments (rredlist R package). IUCN shapefile `category` can lag behind
#   the current website assessment. Known mismatches beyond turtles include:
#   - Chinook Salmon: shapefile LC(1) vs latest IUCN assessment TBD
#   - Killer Whale: shapefile DD(1) vs latest IUCN assessment TBD
#   - Beluga Whale: shapefile LC(1) vs latest IUCN assessment TBD
#   - Humpback Whale: shapefile LC(1) vs latest IUCN assessment TBD
#   - Steller Sea Lion: shapefile NT(2) vs latest IUCN assessment TBD
#   - False Killer Whale: shapefile NT(2) vs latest IUCN assessment TBD
#   - Bearded Seal: shapefile NT(2) vs latest IUCN assessment TBD
#   - Boulder Star Coral: shapefile NT(2) vs latest IUCN assessment TBD
#   - Dolly Varden: shapefile LC(1) vs latest IUCN assessment TBD

# is_turtle: override rng_iucn cell values with latest IUCN website assessments
#   for v4c, generalize this to all species using IUCN API
d_turtle_iucn <- tribble(
  ~scientific_name         , ~iucn_code ,
  "Chelonia mydas"         , "LC"       , # iucnredlist.org/species/4615/285108125
  "Eretmochelys imbricata" , "CR"       , # iucnredlist.org/species/8005/12881238
  "Lepidochelys kempii"    , "CR"       , # iucnredlist.org/species/11533/155057916
  "Dermochelys coriacea"   , "VU"       , # iucnredlist.org/species/6494/43526147
  "Caretta caretta"        , "VU"       , # iucnredlist.org/species/3897/119333622
  "Lepidochelys olivacea"  , "VU"
) # iucnredlist.org/species/11534/3292503

# get rng_iucn mdl_seq for each turtle taxon
d_turtle_mdl <- tbl(con_sdm, "taxon") |>
  filter(sp_cat == "turtle") |>
  select(taxon_id, scientific_name) |>
  inner_join(
    tbl(con_sdm, "taxon_model") |>
      filter(ds_key == "rng_iucn"),
    by = "taxon_id"
  ) |>
  collect() |>
  inner_join(d_turtle_iucn, by = "scientific_name") |>
  mutate(
    value = msens::compute_er_score(glue("IUCN:{iucn_code}"))
  )

# update rng_iucn cell values per turtle species
for (j in 1:nrow(d_turtle_mdl)) {
  r <- d_turtle_mdl[j, ]
  n <- dbExecute(
    con_sdm,
    glue(
      "UPDATE model_cell SET value = {r$value}
       WHERE mdl_seq = {r$mdl_seq}"
    )
  )
  message(glue(
    "  updated {r$scientific_name} rng_iucn (mdl_seq={r$mdl_seq}): ",
    "IUCN:{r$iucn_code} -> value={r$value} ({n} cells)"
  ))
}
  updated Caretta caretta rng_iucn (mdl_seq=19857): IUCN:VU -> value=5 (430925 cells)
  updated Chelonia mydas rng_iucn (mdl_seq=19858): IUCN:LC -> value=1 (432603 cells)
  updated Eretmochelys imbricata rng_iucn (mdl_seq=19860): IUCN:CR -> value=50 (352038 cells)
  updated Lepidochelys kempii rng_iucn (mdl_seq=19861): IUCN:CR -> value=50 (70171 cells)
  updated Dermochelys coriacea rng_iucn (mdl_seq=19859): IUCN:VU -> value=5 (420435 cells)
  updated Lepidochelys olivacea rng_iucn (mdl_seq=19862): IUCN:VU -> value=5 (137612 cells)

2.4 Iterate merge across taxa

Code
ds_key <- "ms_merge"

# cell_tif <- glue("{dir_data}/derived/r_bio-oracle_planarea.tif")
# r_cell <- rast(cell_tif)
# ext(r_cell) <- round(ext(r_cell), 3)

d_x <- tbl(con_sdm, "taxon") |>
  filter(is_ok) |>
  filter(sp_cat == "turtle") |>
  arrange(desc(n_ds), taxon_id) |>
  collect()

# removed: quick fix for turtle sp_cat — now handled by reclassify_reptiles
# chunk after taxon table creation (bind_birds_notbirds_ds)

# table(d_x$sp_cat, useNA = "ifany")
# OLD:
# bird        coral         fish invertebrate       mammal      reptile
#   49           10           12            1           12            7
# NEW (rng_iucn):
# coral         fish invertebrate       mammal        other       turtle
#   375          927          148           58            2            6

ds_keys <- tbl(con_sdm, "dataset") |>
  pull(ds_key) |>
  setdiff("ms_merge")

# datasets for extracting max value SDM
# ds_keys_sdm  <- setdiff(ds_keys, "rng_iucn")   # OLD: excluding IUCN range map except for masking
ds_keys_sdm <- ds_keys # NEW: include IUCN range map for max value SDM
# datasets that form the combined mask (IUCN range + critical habitats + national range), only when global mask exists for species
ds_keys_mask <- tbl(con_sdm, "dataset") |>
  filter(is_mask) |>
  pull(ds_key)

# datasets with global_mask_priority (lower = higher priority)
ds_global_masks <- tbl(con_sdm, "dataset") |>
  filter(!is.na(global_mask_priority)) |>
  select(ds_key, global_mask_priority) |>
  collect() |>
  arrange(global_mask_priority)

t_0 <- Sys.time()
for (i in 1:nrow(d_x)) {
  # i = 1  # which(str_detect(d_x$scientific_name, ".*ricei")) # i = 1512

  d_sp <- d_x |> slice(i)
  is_turtle <- d_sp$sp_cat == "turtle"
  # d_sp |> glimpse()
  # Rows: 1
  # Columns: 18
  # $ taxon_id         <dbl> 127186
  # $ taxon_authority  <chr> "worms"
  # $ n_ds             <int> 5
  # $ am_0.05          <int> 7466
  # $ ch_nmfs          <int> 18230
  # $ ch_fws           <int> 18309
  # $ rng_fws          <int> 18401
  # $ sp_cat           <chr> "fish"
  # $ bl               <int> NA
  # $ mdl_seq          <int> NA
  # $ scientific_name  <chr> "Salmo salar"
  # $ common_name      <chr> "silver salmon"
  # $ redlist_code     <chr> "EN"
  # $ worms_is_marine  <lgl> TRUE
  # $ worms_is_extinct <lgl> NA
  # $ worms_id         <dbl> 127186
  # $ is_ok            <lgl> TRUE
  # $ rng_iucn         <int> 19445

  # list species models by dataset from taxon_model junction table
  d_sp_l <- tbl(con_sdm, "taxon_model") |>
    filter(taxon_id == !!d_sp$taxon_id) |>
    collect() |>
    mutate(
      taxon_authority = d_sp$taxon_authority
    )
  #   taxon_id taxon_authority ds_key   mdl_seq
  #      <dbl> <chr>           <chr>      <int>
  # 1   127186 worms           am_0.05     7466
  # 2   127186 worms           ch_fws     18309
  # 3   127186 worms           ch_nmfs    18230
  # 4   127186 worms           rng_fws    18401
  # 5   127186 worms           rng_iucn   19445
  # NEW ds_key: "ms_merge"; mdl_seq: 20030

  d_r_ds <- tbl(con_sdm, "model_cell") |>
    filter(
      mdl_seq %in% d_sp_l$mdl_seq
    ) |>
    left_join(
      tbl(con_sdm, "model") |>
        select(mdl_seq, ds_key) |>
        filter(mdl_seq %in% d_sp_l$mdl_seq),
      by = "mdl_seq"
    ) |>
    group_by(ds_key) |>
    summarize(
      n_cell = n(),
      v_min = min(value, na.rm = T),
      v_max = max(value, na.rm = T),
      .groups = "drop"
    ) |>
    collect() |>
    mutate(
      ds_str = glue("{ds_key} ({n_cell} cells, {v_min} - {v_max})")
    )
  #   ds_key   n_cell v_min v_max ds_str
  #   <chr>     <dbl> <int> <int> <glue>
  # 1 rng_iucn   6245    50    50 rng_iucn (6245 cells, 50 - 50)
  # 2 ch_fws       29    90    90 ch_fws (29 cells, 90 - 90)
  # 3 am_0.05   11739     1   100 am_0.05 (11739 cells, 1 - 100)
  # 4 ch_nmfs       3    90    90 ch_nmfs (3 cells, 90 - 90)
  # 5 rng_fws     264    50    90 rng_fws (264 cells, 50 - 90)

  if (is_turtle) {
    # turtle multiplicative merge (v4b) ----
    # merged_value = pmax(1, round(er_value * suit_value / 100))
    # critical habitat cells override via max

    # suitability surface: non-mask SDM datasets (e.g., am_0.05)
    ds_suit_sp <- intersect(d_sp_l$ds_key, setdiff(ds_keys_sdm, ds_keys_mask))
    # ER surface: SWOT+DPS
    ds_er_sp <- intersect(d_sp_l$ds_key, "rng_turtle_swot_dps")
    # critical habitat datasets
    ds_ch_sp <- intersect(d_sp_l$ds_key, c("ch_nmfs", "ch_fws", "ca_nmfs"))
    # all datasets used (for description)
    ds_sdm_sp <- unique(c(ds_suit_sp, ds_er_sp, ds_ch_sp))

    if (length(ds_er_sp) > 0) {
      er_mdl_seqs <- d_sp_l |>
        filter(ds_key %in% ds_er_sp) |>
        pull(mdl_seq)
      d_er <- tbl(con_sdm, "model_cell") |>
        filter(mdl_seq %in% er_mdl_seqs) |>
        select(cell_id, er_value = value) |>
        collect()

      if (length(ds_suit_sp) > 0) {
        suit_mdl_seqs <- d_sp_l |>
          filter(ds_key %in% ds_suit_sp) |>
          pull(mdl_seq)
        d_suit <- tbl(con_sdm, "model_cell") |>
          filter(mdl_seq %in% suit_mdl_seqs) |>
          group_by(cell_id) |>
          summarize(suit_value = max(value, na.rm = T), .groups = "drop") |>
          collect()
        d_r_sp <- d_er |>
          left_join(d_suit, by = "cell_id") |>
          mutate(
            suit_value = coalesce(suit_value, 1L),
            value = pmax(
              1L,
              as.integer(round(
                er_value * suit_value / 100
              ))
            )
          ) |>
          select(cell_id, value)
      } else {
        # no suitability: suit=1 everywhere
        d_r_sp <- d_er |>
          mutate(
            value = pmax(1L, as.integer(round(er_value * 1L / 100)))
          ) |>
          select(cell_id, value)
      }
    } else {
      # no SWOT+DPS (unlikely): fall back to max of all SDMs
      sdm_mdl_seqs <- d_sp_l |>
        filter(ds_key %in% intersect(d_sp_l$ds_key, ds_keys_sdm)) |>
        pull(mdl_seq)
      d_r_sp <- tbl(con_sdm, "model_cell") |>
        filter(mdl_seq %in% sdm_mdl_seqs) |>
        group_by(cell_id) |>
        summarize(value = max(value, na.rm = T), .groups = "drop") |>
        collect()
    }

    # union critical habitat cells (override via max)
    if (length(ds_ch_sp) > 0) {
      ch_mdl_seqs <- d_sp_l |>
        filter(ds_key %in% ds_ch_sp) |>
        pull(mdl_seq)
      d_ch <- tbl(con_sdm, "model_cell") |>
        filter(mdl_seq %in% ch_mdl_seqs) |>
        group_by(cell_id) |>
        summarize(value = max(value, na.rm = T), .groups = "drop") |>
        collect()
      d_r_sp <- bind_rows(d_r_sp, d_ch) |>
        group_by(cell_id) |>
        summarize(value = max(value, na.rm = T), .groups = "drop")
    }

    has_global_mask <- FALSE
    has_other_sdm <- FALSE
  } else {
    # non-turtle max merge (v4 approach, unchanged) ----

    # datasets used for SDM
    ds_sdm_sp <- intersect(d_sp_l$ds_key, ds_keys_sdm)

    sdm_mdl_seqs <- d_sp_l |>
      filter(ds_key %in% ds_sdm_sp) |>
      pull(mdl_seq)

    # query of SDM with max value across all datasets
    q_sdm <- tbl(con_sdm, "model_cell") |>
      filter(
        mdl_seq %in% sdm_mdl_seqs
      ) |>
      group_by(cell_id) |>
      summarize(value = max(value, na.rm = T), .groups = "drop")

    # find highest-priority global mask this species has
    sp_global_masks <- d_sp_l |>
      inner_join(ds_global_masks, by = "ds_key") |>
      arrange(global_mask_priority)

    has_global_mask <- nrow(sp_global_masks) > 0

    # only mask if species has at least one SDM dataset besides the global mask
    # (avoid self-intersection, e.g. birds with only BirdLife)
    has_other_sdm <- FALSE
    if (has_global_mask) {
      best_mask_ds <- sp_global_masks$ds_key[1]
      other_sdm_ds <- setdiff(ds_sdm_sp, best_mask_ds)
      has_other_sdm <- length(other_sdm_ds) > 0
    }

    if (has_global_mask && has_other_sdm) {
      # get datasets used for mask
      # exclude lower-priority global masks from the mask union
      lower_global_masks <- sp_global_masks |>
        filter(ds_key != best_mask_ds) |>
        pull(ds_key)
      ds_mask_sp <- setdiff(
        intersect(d_sp_l$ds_key, ds_keys_mask),
        lower_global_masks
      )

      # get mdl_seqs for masking datasets
      mask_mdl_seqs <- d_sp_l |>
        filter(ds_key %in% ds_mask_sp) |>
        pull(mdl_seq)

      q_mask <- tbl(con_sdm, "model_cell") |>
        filter(mdl_seq %in% mask_mdl_seqs) |>
        distinct(cell_id) |>
        mutate(value = 1)

      # get SDM cells with mask applied
      d_r_sp <- q_sdm |>
        semi_join(
          q_mask,
          by = join_by(cell_id)
        ) |>
        collect()
    } else {
      # no global mask or no other SDM to mask — just max value of all SDMs
      d_r_sp <- collect(q_sdm)
    }
  }

  # apply MMPA/MBTA spatial minimum floors ----
  if (isTRUE(d_sp$is_mmpa)) {
    d_r_sp <- d_r_sp |>
      mutate(value = pmax(value, 20L))
  }
  if (isTRUE(d_sp$is_mbta)) {
    d_r_sp <- d_r_sp |>
      mutate(value = pmax(value, 10L))
  }

  sp_sci <- d_sp$scientific_name
  sp_cmn <- d_sp$common_name
  sp_cat <- d_sp$sp_cat
  sp_key <- glue("{ds_key}:{d_sp$taxon_id}")

  # show message every 50 iterations or for the first 5 iterations
  if (i <= 5 || i %% 50 == 0) {
    eta <- Sys.time() + round((Sys.time() - t_0) / i * (nrow(d_x) - i))
    message(glue(
      "Processing {i}/{nrow(d_x)} [{sp_key}] ({sp_cat}): {sp_cmn} (_{sp_sci}_) ~ ETA: {eta}"
    ))
    # started ~15:00
  }

  n_cells <- nrow(d_r_sp)
  if (n_cells == 0) {
    message(glue("No cells for {sp_sci} ({i}/{nrow(d_x)})"))
    next()
  }

  # delete: existing ----
  mdl_seqs <- tbl(con_sdm, "model") |>
    filter(ds_key == !!ds_key, taxa == !!sp_key) |>
    pull(mdl_seq)
  if (length(mdl_seqs) > 0) {
    dbExecute(
      con_sdm,
      glue(
        "DELETE FROM taxon_model WHERE ds_key = '{ds_key}' AND taxon_id = {d_sp$taxon_id}"
      )
    )
    dbExecute(
      con_sdm,
      glue("DELETE FROM model WHERE ds_key = '{ds_key}' AND taxa = '{sp_key}'")
    )
    dbExecute(
      con_sdm,
      glue(
        "DELETE FROM species WHERE ds_key = '{ds_key}' AND taxa = '{sp_key}'"
      )
    )
    dbExecute(
      con_sdm,
      glue(
        "DELETE FROM model_cell WHERE mdl_seq IN ({paste(mdl_seqs, collapse = ',')})"
      )
    )
    dbExecute(
      con_sdm,
      glue(
        "UPDATE taxon SET mdl_seq = NULL WHERE mdl_seq IN ({paste(mdl_seqs, collapse = ',')})"
      )
    )
  }

  # append: model ----
  has_mask_str <- if (has_global_mask && has_other_sdm) {
    glue(
      "; Mask applied (global mask: {best_mask_ds}) from combining datasets: {paste(ds_mask_sp, collapse = ', ')}"
    )
  } else {
    ""
  }
  d_model <- tibble(
    ds_key = ds_key,
    taxa = sp_key,
    time_period = "2025",
    region = "USA",
    mdl_type = "mixed",
    description = if (is_turtle) {
      glue(
        "Marine Sensitivity merged model with multiplicative merge ",
        "(er * suit / 100) from datasets: {paste(ds_sdm_sp, collapse = ', ')}"
      )
    } else {
      glue(
        "Marine Sensitivity merged model with max values from datasets: ",
        "{paste(ds_sdm_sp, collapse = ', ')}{has_mask_str}"
      )
    }
  )
  dbWriteTable(con_sdm, "model", d_model, append = TRUE)
  # tbl(con_sdm, "model") |> collect() |> tail()

  # get the mdl_seq that was just created
  mdl_seq <- dbGetQuery(
    con_sdm,
    glue(
      "
    SELECT mdl_seq FROM model
    WHERE
      ds_key = '{ds_key}' AND
      taxa   = '{sp_key}'
    ORDER BY mdl_seq DESC LIMIT 1 "
    )
  )$mdl_seq

  # d_sp |> glimpse()
  # Rows: 1
  # Columns: 15
  # $ taxon_id         <dbl> 127186
  # $ taxon_authority  <chr> "worms"
  # $ n_ds             <int> 4
  # $ am_0.05          <int> 7466
  # $ ch_nmfs          <int> 18230
  # $ ch_fws           <int> 18309
  # $ rng_fws          <int> 18401
  # $ sp_cat           <chr> "fish"
  # $ bl               <int> NA
  # $ mdl_seq          <int> NA
  # $ scientific_name  <chr> "Salmo salar"
  # $ common_name      <chr> "silver salmon"
  # $ redlist_code     <chr> "EN"
  # $ worms_is_marine  <lgl> TRUE
  # $ worms_is_extinct <lgl> NA

  # append: species ----
  d_species <- tibble(
    ds_key = ds_key,
    taxa = sp_key,
    sp_key = sp_key,
    worms_id = ifelse(
      d_sp$taxon_authority == "worms",
      d_sp$taxon_id,
      NA_integer_
    ),
    botw_id = ifelse(
      d_sp$taxon_authority == "botw",
      d_sp$taxon_id,
      NA_integer_
    ),
    gbif_id = NA_integer_,
    itis_id = NA_integer_,
    iucn_id = NA_integer_,
    scientific_name_dataset = sp_sci,
    common_name_dataset = sp_cmn,
    scientific_name_accepted = sp_sci,
    common_name_accepted = sp_cmn,
    redlist_code = d_sp$redlist_code,
    redlist_year = NA_integer_,
    sp_cat = sp_cat,
    worms_is_marine = d_sp$worms_is_marine,
    worms_is_extinct = d_sp$worms_is_extinct
  )
  # d_species |> glimpse()

  stopifnot(
    length(setdiff(names(d_species), dbListFields(con_sdm, "species"))) == 0
  )
  stopifnot(
    setdiff(dbListFields(con_sdm, "species"), names(d_species)) == "sp_seq"
  )
  dbWriteTable(con_sdm, "species", d_species, append = T)
  # tbl(con_sdm, "species") |> collect() |> tail()
  # tbl(con_sdm, "model")   |> collect() |> tail()
  stopifnot(
    tbl(con_sdm, "species") |> filter(is.na(sp_seq)) |> collect() |> nrow() == 0
  )

  # append: model_cell ----
  d_mdl_cell <- d_r_sp |>
    mutate(
      mdl_seq = mdl_seq
    ) |>
    arrange(cell_id)
  dbWriteTable(con_sdm, "model_cell", d_mdl_cell, append = T)

  # update taxon with mdl_seq
  dbExecute(
    con_sdm,
    glue(
      "
    UPDATE taxon
    SET   mdl_seq  = {mdl_seq}
    WHERE taxon_id = {d_sp$taxon_id}"
    )
  )

  # append ms_merge row to taxon_model junction table
  dbWriteTable(
    con_sdm,
    "taxon_model",
    tibble(
      taxon_id = d_sp$taxon_id,
      ds_key = "ms_merge",
      mdl_seq = as.integer(mdl_seq)
    ),
    append = TRUE
  )
}
Processing 1/6 [ms_merge:137205] (turtle): Loggerhead Turtle (_Caretta caretta_) ~ ETA: 2026-03-17 16:01:00.710179
Processing 2/6 [ms_merge:137206] (turtle): Green Turtle (_Chelonia mydas_) ~ ETA: 2026-03-17 16:01:00.837863
Processing 3/6 [ms_merge:137207] (turtle): Hawksbill Turtle (_Eretmochelys imbricata_) ~ ETA: 2026-03-17 16:00:59.565287
Processing 4/6 [ms_merge:137209] (turtle): Leatherback Turtle (_Dermochelys coriacea_) ~ ETA: 2026-03-17 16:00:59.715418
Processing 5/6 [ms_merge:137208] (turtle): Kemp's Ridley Turtle (_Lepidochelys kempii_) ~ ETA: 2026-03-17 16:00:59.210924
Code
# Salmo salar            silver salmon

3 Set taxon.is_ok

Add is_ok: a simple logical field for flagging valid taxa, ie (so far):

  • birds:
    • redlist_code != “EX”
    • has a botw_id
    • if has worms_id:
      • worms_is_marine != F
      • worms_is_extinct != T
  • no model cells overlap with Program Areas
  • category is “reptile” but not “turtle” (handled by reclassify_reptiles)
  • not birds:
    • has a worms_id
    • worms_is_marine != F
    • worms_is_extinct != T

3.1 Flag valid taxa

Code
d <- tbl(con_sdm, "taxon") |>
  collect() # 17,561 × 16

# birds ----
# d |>
#   filter(
#     taxon_authority == "botw") |>
#   pull(redlist_code) |>
#   table(useNA = "ifany")
#  CR  DD  EN  LC  NT  TN  VU
#   3   1  58 452  45  14  41
#
# d |>
#   filter(
#     taxon_authority == "botw") |>
#   select(worms_is_marine, worms_is_extinct) |>
#   table(useNA = "ifany")
#                worms_is_extinct
# worms_is_marine <NA>
#           FALSE  118
#           TRUE   221
#           <NA>   275

d_b <- d |>
  filter(
    taxon_authority == "botw"
  ) |>
  mutate(
    is_ok = case_when(
      is.na(taxon_id) ~ F,
      is.na(mdl_seq) ~ F,
      !is.na(redlist_code) & redlist_code == "EX" ~ F, # 0
      !is.na(worms_id) & worms_is_marine == F ~ F, # 118
      !is.na(worms_id) & worms_is_extinct == T ~ F, # 0
      .default = T
    )
  )
# d_b$is_ok |> table(useNA = "ifany")
# FALSE  TRUE
#   118   496

# worms ----

# d |>
#   filter(
#     taxon_authority == "worms") |>
#   pull(redlist_code) |>
#   table(useNA = "ifany")
# CR    DD    EN    EX    LC    NT    TN    VU  <NA>
# 47   340   235     1  5574   107    11   154 10478
#
# d |>
#   filter(
#     taxon_authority == "worms") |>
#   select(worms_is_marine, worms_is_extinct) |>
#   table(useNA = "ifany")
#                worms_is_extinct
# worms_is_marine FALSE  TRUE  <NA>
#           FALSE    19     0    24
#           TRUE   4171    20 12708
#           <NA>      0     0     5

d_w <- d |>
  filter(
    taxon_authority == "worms"
  ) |>
  mutate(
    is_ok = case_when(
      is.na(taxon_id) ~ F,
      is.na(mdl_seq) ~ F,
      !is.na(redlist_code) & redlist_code == "EX" ~ F,
      !is.na(worms_id) & worms_is_marine == F ~ F,
      !is.na(worms_id) & worms_is_extinct == T ~ F,
      !is.na(worms_taxonomic_status) &
        !worms_taxonomic_status %in%
          c("accepted", "alternative representation") ~ F,
      sp_cat == "reptile" ~ F,
      sp_cat == "turtle" ~ T,
      .default = T
    )
  )
# d_w$is_ok |> table(useNA = "ifany")
# FALSE   TRUE
#    64 16,883

d2 <- bind_rows(
  d_b,
  d_w
) |>
  select(taxon_id, is_ok)

# flag taxa with no distribution inside program areas ----
# (mirrors r_mask approach in apps_2026/mapsp/app.R but with DB queries)
taxa_in_pra <- tbl(con_sdm, "taxon_model") |>
  inner_join(
    tbl(con_sdm, "model_cell") |> select(mdl_seq, cell_id),
    by = "mdl_seq"
  ) |>
  inner_join(
    tbl(con_sdm, "zone_cell") |> select(zone_seq, cell_id),
    by = "cell_id"
  ) |>
  inner_join(
    tbl(con_sdm, "zone") |>
      filter(
        fld == "programarea_key"
      ) |>
      select(zone_seq),
    by = "zone_seq"
  ) |>
  distinct(taxon_id) |>
  pull(taxon_id)

n_outside <- sum(d2$is_ok & !(d2$taxon_id %in% taxa_in_pra))
message(glue(
  "{n_outside} taxa flagged is_ok=F (no distribution in program areas)"
))
6734 taxa flagged is_ok=F (no distribution in program areas)
Code
d2 <- d2 |>
  mutate(
    is_ok = is_ok & taxon_id %in% taxa_in_pra
  )

stopifnot(sum(duplicated(d2$taxon_id)) == 0)

dbExecute(
  con_sdm,
  "ALTER TABLE taxon ADD COLUMN IF NOT EXISTS is_ok BOOLEAN"
)
[1] 0
Code
duckdb_register(con_sdm, "d2", d2)
dbExecute(
  con_sdm,
  "UPDATE taxon
    SET is_ok  = d2.is_ok
    FROM d2
    WHERE taxon.taxon_id = d2.taxon_id"
) # 17,561
[1] 17561
Code
duckdb_unregister(con_sdm, "d2")

# add is_er_spatial flag (TRUE for turtles where ER is spatially ----
# differentiated per-cell from mask datasets; FALSE for species using
# uniform species-level scalar er_score)
dbExecute(
  con_sdm,
  "ALTER TABLE taxon ADD COLUMN IF NOT EXISTS is_er_spatial BOOLEAN"
)
[1] 0
Code
dbExecute(
  con_sdm,
  "UPDATE taxon SET is_er_spatial = (sp_cat = 'turtle')"
)
[1] 17561

4 Taxon Summary

4.1 Export taxon summary

Code
taxon_csv <- here("data/taxon.csv")

# dbListFields(con_sdm, "taxon") |> paste(collapse = ", ") |> cat()
d_taxon <- tbl(con_sdm, "taxon") |>
  select(
    is_ok,
    component = sp_cat,
    common_name,
    scientific_name,
    redlist_code_max = redlist_code,
    extrisk_code,
    er_score,
    is_mmpa,
    is_mbta,
    is_bcc,
    worms_is_marine,
    worms_is_extinct,
    n_datasets = n_ds,
    taxon_authority,
    taxon_id,
    model_id = mdl_seq
  ) |>
  arrange(desc(is_ok), component, common_name) |>
  collect()

write_csv(d_taxon, taxon_csv)

5 Dataset Summary

Code
tbl(con_sdm, "dataset") |>
  select(sort_order, ds_key, name_display, value_info, is_mask) |>
  arrange(sort_order) |>
  collect() |>
  DT::datatable()
Code
tbl(con_sdm, "model") |>
  group_by(ds_key) |>
  summarize(n_models = n()) |>
  arrange(ds_key) |>
  collect() |>
  DT::datatable()

6 Species Summary

6.1 Species counts by sp_cat and ds_key

Code
tbl(con_sdm, "species") |>
  group_by(sp_cat, ds_key) |>
  summarize(n = n(), .groups = "drop") |>
  arrange(sp_cat, ds_key) |>
  collect() |>
  DT::datatable()

6.2 Taxonomic authority coverage

Code
tbl(con_sdm, "species") |>
  group_by(sp_cat, ds_key) |>
  summarize(
    n = n(),
    n_worms = sum(!is.na(worms_id), na.rm = T),
    n_itis = sum(!is.na(itis_id), na.rm = T),
    n_gbif = sum(!is.na(gbif_id), na.rm = T),
    n_allna = sum(
      is.na(worms_id) & is.na(itis_id) & is.na(gbif_id),
      na.rm = T
    ),
    .groups = "drop"
  ) |>
  mutate(
    pct_worms = round(n_worms / n * 100, 1),
    pct_itis = round(n_itis / n * 100, 1),
    pct_gbif = round(n_gbif / n * 100, 1),
    pct_allna = round(n_allna / n * 100, 1)
  ) |>
  arrange(sp_cat, ds_key) |>
  collect() |>
  DT::datatable()

6.3 WoRMS marine/extinct percentages

Code
tbl(con_sdm, "species") |>
  group_by(sp_cat) |>
  summarize(
    n = n(),
    n_worms_marine = sum(worms_is_marine, na.rm = T),
    n_worms_extinct = sum(worms_is_extinct, na.rm = T)
  ) |>
  mutate(
    pct_worms_marine = round(n_worms_marine / n * 100, 1),
    pct_worms_extinct = round(n_worms_extinct / n * 100, 1)
  ) |>
  collect() |>
  DT::datatable()

7 Taxon Table Summary

7.1 Taxon counts by sp_cat and is_ok

Code
tbl(con_sdm, "taxon") |>
  group_by(sp_cat, is_ok) |>
  summarize(n = n(), .groups = "drop") |>
  collect() |>
  pivot_wider(
    names_from = is_ok,
    values_from = n,
    names_prefix = "is_ok_"
  ) |>
  DT::datatable()

7.2 Redlist code distribution

Code
tbl(con_sdm, "taxon") |>
  filter(is_ok) |>
  group_by(sp_cat, redlist_code) |>
  summarize(n = n(), .groups = "drop") |>
  collect() |>
  pivot_wider(
    names_from = redlist_code,
    values_from = n,
    values_fill = 0
  ) |>
  DT::datatable()

7.3 Extinction risk authority summary

Code
tbl(con_sdm, "taxon") |>
  filter(is_ok) |>
  mutate(
    authority = case_when(
      str_starts(extrisk_code, "NMFS") ~ "NMFS",
      str_starts(extrisk_code, "FWS") ~ "FWS",
      str_starts(extrisk_code, "IUCN") ~ "IUCN",
      TRUE ~ "none"
    )
  ) |>
  count(authority) |>
  collect() |>
  DT::datatable()

7.4 Number of datasets per taxon

Code
tbl(con_sdm, "taxon") |>
  filter(is_ok) |>
  count(n_ds) |>
  collect() |>
  DT::datatable()

8 Taxon x Dataset Matrix

Code
tbl(con_sdm, "taxon_model") |>
  inner_join(
    tbl(con_sdm, "taxon") |>
      filter(is_ok) |>
      select(taxon_id, sp_cat),
    by = "taxon_id"
  ) |>
  group_by(sp_cat, ds_key) |>
  summarize(n = n(), .groups = "drop") |>
  collect() |>
  pivot_wider(
    names_from = ds_key,
    values_from = n,
    values_fill = 0
  ) |>
  DT::datatable()

9 Taxon Detail Table

Code
tbl(con_sdm, "taxon") |>
  filter(is_ok) |>
  select(
    sp_cat,
    scientific_name,
    common_name,
    redlist_code,
    extrisk_code,
    er_score,
    n_ds,
    taxon_id,
    taxon_authority
  ) |>
  arrange(sp_cat, scientific_name) |>
  collect() |>
  DT::datatable(
    filter = "top",
    options = list(pageLength = 25)
  )
Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html