Calculate scores from models loaded into sdm db and taxon table

Published

2026-03-25 15:31:40

1 Overview

This notebook calculates scores from models loaded into sdm db and taxon table, including:

  • Extinction risk metrics by class (eg extrisk_bird, extrisk_fish, etc)
  • Other metrics TBD

Source:

Process:

  • Before:
    1. ingest_taxon.qmd
    • Added con_spp.botw (Birds of the World) as another taxonomic authority with ingest_taxon.qmd 1.merge_models.qmd
    • Built con_sdm.taxon table as authoritative unique taxa reference from multiple entries in species with merge_models.qmd
  • After:
    1. Summary data/figs/README sections are now integrated into this notebook (merged from msens-summary_programareas.qmd)

To render this notebook on Ben’s laptop:

cd ~/Github/MarineSensitivity/workflows
quarto render calc_scores.qmd

2 TODO

  • include more complex models:
    • unique by: ds_key, taxa_id, time_period
    • pkey: mdl_seq
Code
librarian::shelf(
  DBI,
  dbplyr,
  dplyr,
  duckdb,
  DT,
  flextable,
  fs,
  ftExtra,
  ggiraph,
  ggplot2,
  glue,
  gt,
  here,
  htmlwidgets,
  kableExtra,
  knitr,
  leaflet,
  logger,
  mapview,
  writexl,
  purrr,
  quarto,
  RColorBrewer,
  readr,
  scales,
  sf,
  shiny,
  stringr,
  terra,
  tibble,
  tidyr,
  units,
  webshot2,
  quiet = T
)
options(readr.show_col_types = F)

stopifnot(packageVersion("terra") >= "1.8.60")
# [rotate() issue](https://github.com/rspatial/terra/issues/1876)

source(here("libs/sdm_functions.R"))
source(here("libs/paths.R"))
dir_create(dir_v)
dir_create(dir_big_v)

# copy sdm.duckdb from prior version if missing
sdm_db_prev <- glue("{dir_big}/{ver_prev}/sdm.duckdb")
if (!file_exists(sdm_db) && file_exists(sdm_db_prev)) {
  message(glue("Copying sdm.duckdb from {ver_prev} to {ver}..."))
  file_copy(sdm_db_prev, sdm_db)
  message(glue("Copied sdm.duckdb ({file_size(sdm_db)})"))
}

# inputs (shared, unversioned)
pra_raw_gpkg <- glue("{dir_raw}/boem.gov/boem_program_areas_2025-11.gpkg")
cell_tif <- glue("{dir_derived}/r_bio-oracle_planarea.tif")
stopifnot(all(file_exists(c(cell_tif, pra_raw_gpkg))))

# version-independent input polygons (find from prior versions)
# note: search stable prior versions first; current dir_v files on Google Drive
#   can be evicted during long renders
find_input <- function(fname, search_dirs) {
  for (d in search_dirs) {
    f <- glue("{d}/{fname}")
    if (file_exists(f)) return(f)
  }
  stop(glue(
    "Input not found: {fname} (searched {paste(search_dirs, collapse=', ')})"
  ))
}
prev_dirs <- glue("{dir_derived}/{c(ver_prev, 'v2', 'v1')}")
er_raw_gpkg <- find_input("ply_ecoregions_2025.gpkg", prev_dirs)
pa_gpkg <- find_input("ply_planareas_2025.gpkg", prev_dirs)

# flags, typical
do_zones <- F # no, except if new spatial zones added
do_metrics <- T # YES
do_zone_taxon <- T # slow in past
do_postgis <- F # NO, since migrated to pmtiles
do_downloads <- T # YES
do_summary_data <- T # ecoregion x PRA intersections, label points, score tables
do_summary_figs <- T # maps, flower plots, primprod charts, Word/Excel tables
do_readme <- T # programmatic README.md + README.html
do_map <- F # no, but useful for visual checks during development
do_parquet <- T # no, but produces smaller faster duckdb
do_pmtiles <- F # generate PMTiles archives for mapgl/mapsp apps
do_pra_primprod <- F # NO, unless zones or primary prod ∆; zonal primary productivity for program areas
do_deploy <- T # sync derived data + app code to server

stopifnot(!do_pmtiles || (do_pmtiles && do_downloads)) # pmtiles should be generated from downloads

# spatial outputs (year reflects input data vintage, version suffix model version of derived metric columns)
pra_gpkg <- glue("{dir_v}/ply_programareas_2026_{ver}.gpkg")
sr_gpkg <- glue("{dir_v}/ply_subregions_2026_{ver}.gpkg")

# computed outputs (version suffix only — no obvious input year)
sr_pra_csv <- find_input("subregion_programareas.csv", c(dir_v, prev_dirs))
metrics_tif <- glue("{dir_v}/r_metrics_{ver}.tif")
metrics_pra_tif <- glue("{dir_v}/r_metrics_programareas_{ver}.tif")
zone_taxon_csv <- glue("{dir_v}/zone_taxon_{ver}.csv")
lyrs_csv <- glue("{dir_v}/layers_{ver}.csv")
# file_exists(c(sr_pra_csv, metrics_tif, metrics_pra_tif, zone_taxon_csv, lyrs_csv))

# summary outputs (merged from msens-summary_programareas.qmd)
er_pra_gpkg <- glue("{dir_v}/ply_ecoregion_programarea_{ver}.gpkg")
er_pra_csv <- glue("{dir_v}/ply_ecoregion_programarea_{ver}.csv")
er_pra_n_csv <- glue("{dir_v}/tbl_er_pra_scores_{ver}.csv")
pra_n_csv <- glue("{dir_v}/tbl_pra_scores_{ver}.csv")
lbls_csv <- glue("{dir_v}/ply_label_placement_pra.csv")
dir_figs <- here(glue("figs/msens-summary_programareas_{ver}"))
tbl_pra <- glue("ply_programareas_2026_{ver}")
tbl_er <- glue("ply_ecoregions_2025")
pp_csv <- glue("{dir_v}/programarea_primprod.csv")
pp_png <- glue("{dir_v}/programarea_primprod.png")
pp_pdf <- glue("{dir_v}/programarea_primprod.pdf")
component_pct_pra_csv <- glue("{dir_v}/tbl_component_pct_programarea_{ver}.csv")

# redo flags for summary sections
redo_summary_data <- T
redo_summary_figs <- T

# parquet (big files)
dir_pq <- glue("{dir_big_v}/sdm_parquet")

# helper functions ----
get_rast <- function(m_key) {
  d <- dbGetQuery(
    con_sdm,
    glue(
      "
        SELECT
          cm.cell_id,
          cm.value
        FROM cell_metric cm
        WHERE cm.metric_seq = (
          SELECT metric_seq
          FROM metric
          WHERE metric_key = '{m_key}' )"
    )
  )
  stopifnot(sum(duplicated(d$cell_id)) == 0)

  r <- init(r_cell[[1]], NA)
  r[d$cell_id] <- d$value

  r
}

plot_flower <- function(
  data,
  fld_category,
  fld_height,
  fld_width,
  colors,
  tooltip_expr = NULL,
  score = NULL,
  title = NULL,
  interactive = T
) {
  stopifnot(is.numeric(data |> pull({{ fld_height }})))
  stopifnot(is.numeric(data |> pull({{ fld_width }})))

  if (is.null(score)) {
    score <- data |>
      mutate(
        "{{fld_height}}" := as.double({{ fld_height }}),
        "{{fld_width}}" := as.double({{ fld_width }})
      ) |>
      summarize(
        score = weighted.mean({{ fld_height }}, {{ fld_width }}, na.rm = T)
      ) |>
      pull(score)
  }

  d <- data |>
    arrange({{ fld_category }}) |>
    mutate(across(!where(is.character), as.double)) |>
    mutate(
      ymax = cumsum({{ fld_width }}),
      ymin = lag(ymax, default = 0),
      xmax = {{ fld_height }},
      xmin = 0
    )

  sym_category <- ensym(fld_category)
  sym_height <- ensym(fld_height)
  sym_width <- ensym(fld_width)

  if (!is.null(tooltip_expr)) {
    d <- d |>
      mutate(tooltip = glue(tooltip_expr))
  } else {
    d <- d |>
      mutate(tooltip = glue("{!!fld_category}"))
  }

  g <- ggplot(d) +
    geom_rect_interactive(
      aes(
        xmin = xmin,
        xmax = xmax,
        ymin = ymin,
        ymax = ymax,
        fill = {{ fld_category }},
        color = "white",
        data_id = {{ fld_category }},
        tooltip = tooltip
      ),
      color = "white",
      alpha = 0.5
    ) +
    coord_polar(theta = "y") +
    xlim(c(-10, max(data |> pull({{ fld_height }})))) +
    annotate(
      "text",
      x = -10,
      y = 0,
      label = round(score),
      size = 8,
      fontface = "bold"
    ) +
    scale_fill_manual(values = colors) +
    theme_minimal() +
    theme(
      legend.position = "bottom",
      plot.margin = unit(c(20, 20, 20, 20), "pt")
    )

  if (!is.null(title)) {
    g <- g + ggtitle(title)
  }

  if (interactive) {
    g <- girafe(
      ggobj = g,
      options = list(
        opts_sizing(rescale = TRUE, width = 1),
        opts_tooltip(
          css = "background-color:white;color:black;padding:5px;border-radius:3px;"
        )
      )
    )
  }

  g
}

# conditional summary setup ----
if (do_summary_figs) {
  mapbox_tkn_txt <- glue("{dir_private}/mapbox_token_bdbest.txt")
  Sys.setenv(MAPBOX_PUBLIC_TOKEN = readLines(mapbox_tkn_txt))
  librarian::shelf(mapgl, quiet = T)
  dir_create(dir_figs)
}

con_sdm <- dbConnect(duckdb(), dbdir = sdm_db, read_only = F)
# dbDisconnect(con_sdm, shutdown = T); duckdb_shutdown(duckdb()); rm(con_sdm)
if (do_postgis) {
  source(here("libs/db.R"))
} # generates Postgres `con`; fails if ssh-msens not connected

# clean up prior version zone references in DuckDB ----
zone_tbls_db <- tbl(con_sdm, "zone") |> distinct(tbl) |> pull(tbl)

# rename version-independent zones (ecoregions, planning areas) to remove old suffix
for (tbl_base in c("ply_ecoregions_2025", "ply_planareas_2025")) {
  old_tbl <- glue("{tbl_base}_{ver_prev}")
  if (old_tbl %in% zone_tbls_db) {
    dbExecute(
      con_sdm,
      glue("UPDATE zone SET tbl = '{tbl_base}' WHERE tbl = '{old_tbl}'")
    )
    message(glue("Renamed zone tbl: {old_tbl} -> {tbl_base}"))
  }
}

# rename versioned zones (program areas, subregions) from ver_prev to ver
for (old_tbl in zone_tbls_db[str_detect(zone_tbls_db, glue("_{ver_prev}$"))]) {
  new_tbl <- str_replace(old_tbl, glue("_{ver_prev}$"), glue("_{ver}"))
  dbExecute(
    con_sdm,
    glue("UPDATE zone SET tbl = '{new_tbl}' WHERE tbl = '{old_tbl}'")
  )
  # zone_taxon uses zone_tbl column
  dbExecute(
    con_sdm,
    glue(
      "UPDATE zone_taxon SET zone_tbl = '{new_tbl}' WHERE zone_tbl = '{old_tbl}'"
    )
  )
  message(glue("Renamed zone tbl: {old_tbl} -> {new_tbl}"))
}

sp_cats <- tbl(con_sdm, "taxon") |>
  filter(is_ok) |>
  distinct(sp_cat) |>
  arrange(sp_cat) |>
  pull(sp_cat)

# for visual checks
r_cell <- rast(cell_tif)
ext(r_cell) <- round(ext(r_cell), 3) # TODO: culprit with warlus wonk?
Code
old_csv <- glue("{dir_v}/tbl_pra_scores_v3_old-pre-mmpa-mbta-floor.csv")
out_csv <- glue("{dir_v}/tbl_pra_scores_v3_new-vs-old-pre-mmpa-mbta-floor.csv")
new_csv <- pra_n_csv

d_old <- read_csv(old_csv)
d_new <- read_csv(new_csv)
d_comp <- d_old |>
  full_join(
    d_new,
    by = join_by(`Program Area`),
    suffix = c("_old", "_new")
  ) |>
  mutate(
    Score_diff = Score_new - Score_old
  ) |>
  select(`Program Area`, Score_old, Score_new, Score_diff)
write_csv(d_comp, out_csv)

Flags for evaluating R chunks:

  • do_zones: FALSE
  • do_metrics: TRUE
  • do_zone_taxon: TRUE
  • do_postgis: FALSE
  • do_downloads: TRUE
  • do_summary_data: TRUE
  • do_summary_figs: TRUE
  • do_readme: TRUE
  • do_map: FALSE
  • do_parquet: TRUE
  • do_deploy: TRUE

3 Add Zones

3.1 Add cell table to DuckDB if missing

Code
if (!"cell" %in% dbListTables(con_sdm)) {
  if (!exists("r_cell")) {
    r_cell <- rast(cell_tif)
  }

  d_cell <- values(r_cell, dataframe = T, na.rm = T) |>
    tibble() |>
    mutate(
      cell_id = as.integer(cell_id)
    )

  # dbRemoveTable(con_sdm, "cell")
  dbWriteTable(con_sdm, "cell", d_cell, append = F)
  dbExecute(con_sdm, "ALTER TABLE cell ADD PRIMARY KEY (cell_id);")
}

3.2 Add Program Areas to SDM zones

Add BOEM Program Areas (11th National Draft Proposed Program, 2025-11) as a new zone type. Program Areas are derived from Planning Areas with additional GOA subdivisions.

Code
zone_tbls <- tbl(con_sdm, "zone") |>
  distinct(tbl) |>
  pull(tbl)

if (!tbl_pra %in% zone_tbls) {
  source(here("libs/sdm_functions.R"))
  # TODO: move these into msens R
  # - set_geom_ctr_area()
  # - register_zones()
  # - insert_zone_cell_data()

  # load planning areas for field join
  pa <- read_sf(pa_gpkg) |>
    st_drop_geometry() |>
    select(planarea_name, region_key, planarea_key)

  # load and process program areas with field renaming
  pra <- read_sf(pra_raw_gpkg) |>
    rename(
      region_name = BOEM_OCS_REGION,
      planarea_name = PLANNING_AREA,
      programarea_name = Label,
      programarea_id = OBJECTID
    ) |>
    # left join to get region_key and planarea_key from planning areas
    left_join(pa, by = "planarea_name") |>
    # programarea_key defaults to planarea_key
    mutate(
      programarea_key = planarea_key
    ) |>
    # handle NA values for GOA Program Areas (not in planning areas)
    mutate(
      programarea_key = case_when(
        programarea_name == "GOA Program Area A" ~ "GAA",
        programarea_name == "GOA Program Area B" ~ "GAB",
        TRUE ~ programarea_key
      ),
      planarea_key = case_when(
        programarea_name == "GOA Program Area A" ~ "WGA,CGA,EGA",
        programarea_name == "GOA Program Area B" ~ "CGA,EGA",
        TRUE ~ planarea_key
      ),
      region_key = case_when(
        str_detect(programarea_name, "GOA") ~ "GA",
        TRUE ~ region_key
      )
    ) |>
    select(
      programarea_id,
      region_key,
      region_name,
      planarea_key,
      planarea_name,
      programarea_key,
      programarea_name
    ) |>
    st_transform(st_crs(4326))

  # add centroid and area
  pra <- pra |> set_geom_ctr_area()

  # write derived gpkg
  write_sf(pra, pra_gpkg)
  message(glue("Wrote {nrow(pra)} program areas to {pra_gpkg}"))
  # Wrote 20 program areas to ~/My Drive/projects/msens/data/derived/ply_programareas_2026.gpkg

  # shift lon [-180, 180] to [0, 360]
  pra_sl <- st_shift_longitude(pra)

  # load cell raster template
  r_cell <- rast(cell_tif, lyrs = "cell_id")

  # get percent each feature covers each cell
  r_pra <- rasterize(
    pra_sl,
    r_cell,
    fun = "max",
    touches = T,
    cover = T,
    by = "programarea_key"
  )

  # convert to data frame
  d_pra <- terra::as.data.frame(
    r_pra,
    cells = T,
    na.rm = F,
    wide = F
  ) |>
    filter(!is.na(values)) |>
    tibble()

  message(glue("Program areas cover {length(unique(d_pra$cell))} cells"))

  # register zones and insert zone data into con_sdm
  register_zones(pra, tbl_pra, "programarea_key")
  insert_zone_cell_data(d_pra, tbl_pra, "programarea_key")

  # verify registration
  tbl(con_sdm, "zone") |>
    group_by(tbl) |>
    summarise(n_zones = n()) |>
    collect() |>
    print()
}

3.3 Create ply_programareas_2026 in PostGIS database (deprecated)

Deprecated: migrated to PMTiles. Kept for reference.

Code
# devtools::install_local("../msens")
librarian::shelf(
  here,
  leaflet,
  leaflet.extras, # mapview,
  marinesensitivity / msens,
  sf,
  tibble,
  quiet = T
)

source(here("libs/db.R")) # define: con

# load from gpkg if not already in memory (eg zones already existed)
if (!exists("pra")) {
  pra <- read_sf(pra_gpkg)
}

pra |>
  st_write(
    dsn = con,
    layer = tbl_pra,
    driver = "PostgreSQL",
    layer_options = "OVERWRITE=true"
  )

# TODO: need to convert to 1st column to integer for mapgl viewing?

# enforce SRID so shows up in tile.marinesensitivity.org
dbExecute(
  con,
  glue(
    "SELECT UpdateGeometrySRID('{tbl_pra}','geom',4326);"
  )
)

# fix any invalid geometries
dbExecute(
  con,
  glue(
    "UPDATE {tbl_pra} SET geom = ST_MakeValid(geom) WHERE NOT ST_IsValid(geom)"
  )
) # 6 rows fixed

# add spatial index for faster queries
dbExecute(
  con,
  glue(
    "CREATE INDEX IF NOT EXISTS {tbl_pra}_geom_idx ON {tbl_pra} USING gist(geom);"
  )
)

3.4 Create Subregion-ProgramArea Lookup

Create lookup table for which program areas belong to which subregions.

Used in dropdown of mapgl app.

Code
# calculate intersections between subregions and program areas using zone_cell
d_sr_pra <- tbl(con_sdm, "zone_cell") |>
  inner_join(
    tbl(con_sdm, "zone") |>
      filter(tbl == !!tbl_sr),
    by = "zone_seq"
  ) |>
  rename(subregion_key = value) |>
  select(cell_id, subregion_key) |>
  inner_join(
    tbl(con_sdm, "zone_cell") |>
      inner_join(
        tbl(con_sdm, "zone") |>
          filter(tbl == !!tbl_pra),
        by = "zone_seq"
      ) |>
      select(cell_id, programarea_key = value),
    by = "cell_id"
  ) |>
  distinct(subregion_key, programarea_key) |>
  collect()


get_sr_bbox <- function(sr_key) {
  pra_sr <- d_sr_pra |>
    filter(subregion_key == !!sr_key) |>
    pull(programarea_key)
  r <- init(r_cell[[1]], NA)
  r_sr <- r_metrics[["programarea_key"]] %in% pra_sr
  trim(r_sr) |> st_bbox() |> as.numeric()
}
# TODO: pre-calc bboxes for all subregions

write_csv(d_sr_pra, sr_pra_csv)
message(glue(
  "Wrote {nrow(d_sr_pra)} subregion-programarea mappings to {sr_pra_csv}"
))
# Wrote 60 subregion-programarea mappings to ~/My Drive/projects/msens/data/derived/subregion_programareas.csv

3.5 Add SubRegions

Original:

  • USA: USA mainland, Alaska, Hawaii and territories
  • AK: Alaska
  • L48: USA mainland
  • AKL48: USA mainland & Alaska

Conceived for later:

  • HI: Hawaii
  • L48Pac: Pacific USA mainland
  • L48Atl: Atlantic USA mainland
  • L48Goa: Gulf of America USA mainland
  • PAC: Pacific USA (Hawaii, Pacific mainland and USA Territories)

version for 2026:

  • USA: USA mainland and Alaska
  • AK: Alaska
  • L48Pac: Pacific USA mainland
  • L48Atl: Atlantic USA mainland
  • L48Goa: Gulf of America USA mainland

v3:

  • USA: All USA (Alaska, Gulf of America and Pacific)
  • AK: Alaska
  • GA: Gulf of America
  • PA: Pacific
Code
zones <- tbl(con_sdm, "zone") |>
  distinct(tbl) |>
  pull(tbl)

# delete existing tbl_sr
if (tbl_sr %in% zones) {
  dbExecute(con_sdm, glue("DELETE FROM zone WHERE tbl = '{tbl_sr}'"))
  # dbExecute(con_sdm, "DELETE FROM zone_cell WHERE zone_seq NOT IN (SELECT zone_seq FROM zone)")
}

d_sr <- tribble(
  ~subregion_key , ~subregion_name   ,
  "AK"           , "Alaska"          ,
  "GA"           , "Gulf of America" ,
  "PA"           , "Pacific"         ,
  "USA"          , "All USA"
) # note: subregion_name not used (yet)
register_zones(d_sr, tbl_sr, "subregion_key")

# load pra from gpkg if not already in memory
if (!exists("pra")) {
  if (file_exists(pra_gpkg)) {
    pra <- read_sf(pra_gpkg)
  } else {
    pra_prev <- glue(
      "{dir_derived}/{ver_prev}/ply_programareas_2026_{ver_prev}.gpkg"
    )
    pra <- read_sf(pra_prev)
  }
}

# temporarily push tmp_pra into duckdb for later joining
tmp_pra <- pra |>
  st_drop_geometry() |>
  select(programarea_key, region_key)
copy_to(con_sdm, tmp_pra, "tmp_pra", temporary = T, overwrite = T)

df_z_sr_notusa <- tbl(con_sdm, "zone") |>
  filter(
    tbl == !!tbl_sr,
    value != "USA"
  ) |>
  select(region_key = value, zone_seq_sr = zone_seq) |>
  left_join(
    tbl(con_sdm, "tmp_pra"),
    by = "region_key"
  ) |>
  left_join(
    tbl(con_sdm, "zone") |>
      filter(tbl == !!tbl_pra) |>
      select(
        programarea_key = value,
        zone_seq
      ),
    by = "programarea_key"
  ) |>
  left_join(
    tbl(con_sdm, "zone_cell"),
    by = "zone_seq"
  ) |>
  group_by(zone_seq_sr, cell_id) |>
  summarize(
    pct_covered = sum(pct_covered, na.rm = T),
    .groups = "drop"
  ) |>
  mutate(
    pct_covered = pmin(pct_covered, 100, na.rm = T)
  ) |>
  select(
    zone_seq = zone_seq_sr,
    cell_id,
    pct_covered
  ) |>
  collect() # 349,139 × 3
# summary(df_z_sr_notusa)

# duplicated(df_z_sr_notusa$cell_id) |> any() # FALSE
# TODO: check why no overlapping cells across subregions?

# zone_seqs_sr <- tbl(con_sdm, "zone") |>
#   filter(
#     tbl == !!tbl_sr) |>
#   pull(zone_seq)
# dbExecute(con_sdm, glue("
#   DELETE FROM zone_cell
#   WHERE zone_seq IN ({paste(zone_seqs_sr, collapse = ',')})"))

# append USA, ie all cells in other subregions
zone_seq_USA <- tbl(con_sdm, "zone") |>
  filter(
    tbl == !!tbl_sr,
    value == "USA"
  ) |>
  pull(zone_seq) # 217

df_z_sr_usa <- df_z_sr_notusa |>
  group_by(cell_id) |>
  summarize(
    pct_covered = sum(pct_covered, na.rm = T),
    .groups = "drop"
  ) |>
  mutate(
    zone_seq = !!zone_seq_USA,
    pct_covered = pmin(pct_covered, 100, na.rm = T)
  ) # 349,139 × 3
# summary(df_z_sr_usa)

df_z_sr <- bind_rows(
  df_z_sr_notusa,
  df_z_sr_usa
)

# TODO: ....
copy_to(
  dest = con_sdm,
  df = df_z_sr,
  name = "zone_cell",
  append = T
)

tbl(con_sdm, "zone") |>
  filter(tbl == !!tbl_sr) |>
  select(zone_seq, subregion_key = value) |>
  left_join(
    tbl(con_sdm, "zone_cell"),
    by = "zone_seq"
  ) |>
  group_by(subregion_key) |>
  summarize(n_cell = n()) |>
  arrange(subregion_key)
#   subregion_key n_cell
#   <chr>          <dbl>
# 1 AK            307,445
# 2 GA             17,274
# 3 PA             24,420
# 4 USA           349,139
Code
zone_tbls <- tbl(con_sdm, "zone") |>
  distinct(tbl) |>
  pull(tbl)

if (!"ply_subregions_2025" %in% zone_tbls) {
  source(here("libs/sdm_functions.R"))
  # - set_geom_ctr_area()
  # - register_zones()
  # - insert_zone_cell_data()

  # spatial sources
  er <- read_sf(er_raw_gpkg)
  r_cell <- rast(cell_tif, lyrs = "cell_id")

  # USA: USA mainland, Alaska, Hawaii and territories
  sr_usa <- er |>
    st_union() |>
    st_as_sf() |>
    mutate(
      subregion_key = "USA",
      subregion_name = "USA mainland, Alaska, Hawaii and territories"
    )

  # AK: Alaska
  sr_ak <- er |>
    filter(region_key == "AK") |>
    st_union() |>
    st_as_sf() |>
    mutate(
      subregion_key = "AK",
      subregion_name = "Alaska"
    )

  # L48: USA mainland
  sr_l48 <- er |>
    filter(
      ecoregion_key %in%
        c(
          "CAC",
          "WAOR",
          "SECS",
          "WCGOA",
          "EGOA",
          "NECS"
        )
    ) |>
    st_union() |>
    st_as_sf() |>
    mutate(
      subregion_key = "L48",
      subregion_name = "USA mainland"
    )

  # AKL48: USA mainland & Alaska
  sr_akl48 <- bind_rows(sr_ak, sr_l48) |>
    st_union() |>
    st_as_sf() |>
    mutate(
      subregion_key = "AKL48",
      subregion_name = "USA mainland & Alaska"
    )

  # combine
  sr <- bind_rows(
    sr_ak,
    sr_l48,
    sr_akl48,
    sr_usa
  ) |>
    set_geom_ctr_area()

  # write gpkg
  write_sf(sr, sr_gpkg)

  # shift lon [-180, 180] to [0, 360]
  sr_sl <- st_shift_longitude(sr)

  # get percent each feature covers each cell
  r_sr <- rasterize(
    sr_sl,
    r_cell,
    fun = "max",
    touches = T,
    cover = T,
    by = "subregion_key"
  )

  # convert to data frame
  d_sr <- terra::as.data.frame(
    r_sr,
    cells = T,
    na.rm = F,
    wide = F
  ) |>
    filter(
      !is.na(values)
    ) |> # 839,534 × 3
    tibble()

  # d_sr |>
  #   group_by(layer) |>
  #   summarise(n_cells  = n()) |>
  #   collect() |>
  #   arrange(desc(n_cells))
  #
  #   layer  n_cells
  #   <chr>    <int>
  # 1 USA    630,806
  # 2 AKL48  419,767
  # 3 AK     315,369
  # 4 L48    104,398

  # register zones and insert zone data into con_sdm
  register_zones(sr, "ply_subregions_2025", "subregion_key")
  insert_zone_cell_data(d_sr, "ply_subregions_2025", "subregion_key")

  # tbl(con_sdm, "zone") |>
  #   group_by(tbl) |>
  #   summarise(n_zones = n()) |>
  #   collect()
  #
  #   tbl                 n_zones
  #   <chr>                 <dbl>
  # 1 ply_ecoregions_2025      12
  # 2 ply_planareas_2025       36
  # 3 ply_subregions_2025       4

  # TODO: load ply_subregions_2025 into PostGIS
}

4 Calculate Metrics

4.1 Calculate Extinction Risk Metrics by Taxonomic Component

Code
# Start fresh with all extrisk_* metrics
existing_seqs <- tbl(con_sdm, "metric") |>
  filter(
    str_starts(metric_key, "extrisk_")
  ) |>
  pull(metric_seq)
if (length(existing_seqs) > 0) {
  metric_sql <- paste(existing_seqs, collapse = ", ")
  dbExecute(
    con_sdm,
    glue("DELETE FROM cell_metric WHERE metric_seq IN ({metric_sql})")
  )
  dbExecute(
    con_sdm,
    glue("DELETE FROM zone_metric WHERE metric_seq IN ({metric_sql})")
  )
  dbExecute(
    con_sdm,
    glue("DELETE FROM metric WHERE metric_seq IN ({metric_sql})")
  )
}
[1] 35
Code
d_taxon_ok <- tbl(con_sdm, "taxon") |>
  filter(is_ok) |>
  collect()

for (sp_cat in sp_cats) {
  # sp_cat = sp_cats[1]

  message(glue("Calculating extinction risk metrics for {sp_cat}..."))

  metric_key <- glue("extrisk_{str_replace(sp_cat, ' ', '_')}")
  description <- glue("Extinction risk for {sp_cat}")
  # TODO: rename metric_key -> metric_key, description

  # Get metric_seq first, then delete/insert
  existing_seq <- dbGetQuery(
    con_sdm,
    glue(
      "SELECT metric_seq FROM metric WHERE metric_key = '{metric_key}'"
    )
  )

  if (nrow(existing_seq) > 0) {
    metric_seq <- existing_seq$metric_seq
    dbExecute(
      con_sdm,
      glue("DELETE FROM cell_metric WHERE metric_seq = {metric_seq}")
    )
    dbExecute(
      con_sdm,
      glue("DELETE FROM zone_metric WHERE metric_seq = {metric_seq}")
    )
    dbExecute(
      con_sdm,
      glue("DELETE FROM metric WHERE metric_seq = {metric_seq}")
    )
  }

  dbExecute(
    con_sdm,
    glue(
      "
    INSERT INTO metric (metric_key, description)
    VALUES ('{metric_key}', '{description}')"
    )
  )

  metric_seq <- dbGetQuery(
    con_sdm,
    glue(
      "SELECT max(metric_seq) AS metric_seq FROM metric WHERE metric_key = '{metric_key}'"
    )
  ) |>
    pull(metric_seq)

  # use er_score from taxon (pre-computed in ingest_nmfs-fws-mmpa-mbta-listings.qmd
  # and merged in merge_models.qmd)
  # for is_er_spatial species (turtles), ER is already baked into cell values
  # so set er_score = 100 to avoid double-counting (100 * value / 100 = value)
  d_er <- tbl(con_sdm, "taxon") |>
    filter(
      is_ok,
      if (!!sp_cat == "all") T else sp_cat == !!sp_cat
    ) |>
    select(mdl_seq, sp_cat, extrisk_code, is_mbta, er_score, is_er_spatial) |>
    collect() |>
    mutate(
      # all marine mammals are MMPA-protected; recompute er_score with is_mmpa=TRUE
      # (USFWS-jurisdiction mammals like walrus missed is_mmpa flag)
      er_score = ifelse(
        sp_cat == "mammal",
        msens::compute_er_score(
          extrisk_code,
          is_mmpa = TRUE,
          is_mbta = is_mbta
        ),
        er_score
      ),
      er_score = ifelse(is_er_spatial, 100, er_score)
    ) |>
    select(mdl_seq, er_score)

  duckdb_register(con_sdm, "tmp_er", d_er |> select(mdl_seq, er_score))
  dbExecute(
    con_sdm,
    glue(
      "
    INSERT INTO cell_metric (cell_id, metric_seq, value)
    SELECT
      mc.cell_id,
      {metric_seq} AS metric_seq,
      ROUND(SUM(er.er_score * mc.value) / 100.0, 2) AS value
    FROM model_cell mc
    JOIN tmp_er er ON mc.mdl_seq = er.mdl_seq
    GROUP BY mc.cell_id"
    )
  ) # 6,662,075
  duckdb_unregister(con_sdm, "tmp_er")

  # visual check ----
  # d <- tbl(con_sdm, "cell_metric") |>
  #   filter(metric_seq == metric_seq) |>
  #   select(cell_id, value) |>
  #   collect()
  # r <- init(r_cell[[1]], NA)
  # r[d$cell_id] <- d$value
  # r_r <- rotate(r)
  # plet(r_r)
}
Calculating extinction risk metrics for bird...
Calculating extinction risk metrics for coral...
Calculating extinction risk metrics for fish...
Calculating extinction risk metrics for invertebrate...
Calculating extinction risk metrics for mammal...
Calculating extinction risk metrics for other...
Calculating extinction risk metrics for turtle...
Code
tbl(con_sdm, "metric") |>
  filter(str_starts(metric_key, "extrisk_")) |>
  arrange(metric_seq) |>
  select(-date_created) |>
  collect() |>
  print(n = Inf)
# A tibble: 7 × 5
  metric_seq metric_key           description         metric_title metric_abbrev
       <int> <chr>                <chr>               <chr>        <chr>        
1        906 extrisk_bird         Extinction risk fo… <NA>         <NA>         
2        907 extrisk_coral        Extinction risk fo… <NA>         <NA>         
3        908 extrisk_fish         Extinction risk fo… <NA>         <NA>         
4        909 extrisk_invertebrate Extinction risk fo… <NA>         <NA>         
5        910 extrisk_mammal       Extinction risk fo… <NA>         <NA>         
6        911 extrisk_other        Extinction risk fo… <NA>         <NA>         
7        912 extrisk_turtle       Extinction risk fo… <NA>         <NA>         

4.2 Calculate ExtinctionRisk min/max per SpeciesCategory and Ecoregion

Code
for (sp_cat in sp_cats) {
  # sp_cat = sp_cats[1]
  for (fxn in c("min", "max")) {
    # fxn = "min"

    m_key <- glue("extrisk_{sp_cat}_ecoregion_{fxn}")
    m_description <- glue("Extinction risk for {sp_cat} Ecoregional {fxn}")
    message(glue("m_key: {m_key}"))

    # Get metric_seq first, then delete/insert
    existing_seq <- dbGetQuery(
      con_sdm,
      glue(
        "SELECT metric_seq FROM metric WHERE metric_key = '{m_key}'"
      )
    ) |>
      pull(metric_seq)
    if (length(existing_seq) > 0) {
      dbExecute(
        con_sdm,
        glue("DELETE FROM metric      WHERE metric_seq = {existing_seq}")
      )
      dbExecute(
        con_sdm,
        glue("DELETE FROM cell_metric WHERE metric_seq = {existing_seq}")
      )
      dbExecute(
        con_sdm,
        glue("DELETE FROM zone_metric WHERE metric_seq = {existing_seq}")
      )
    }

    dbExecute(
      con_sdm,
      glue(
        "INSERT INTO metric (metric_key, description)
      VALUES ('{m_key}', '{m_description}')"
      )
    )

    # get metric ID
    m_seq <- dbGetQuery(
      con_sdm,
      glue(
        "SELECT metric_seq FROM metric WHERE metric_key = '{m_key}'"
      )
    ) |>
      pull(metric_seq)

    # calculate min/max extinction risk per ecoregion
    dbExecute(
      con_sdm,
      glue(
        "INSERT INTO zone_metric (zone_seq, metric_seq, value)
      SELECT 
        z.zone_seq,
        -- z.tbl, z.fld, z.value AS tbl_fld_val,
        {m_seq} as metric_seq,
        {fxn}(cm.value) as value
      FROM zone z
      LEFT JOIN zone_cell zc ON z.zone_seq = zc.zone_seq
      LEFT JOIN cell_metric cm ON zc.cell_id = cm.cell_id
      WHERE
        z.fld = 'ecoregion_key'
        AND cm.metric_seq IN (
          SELECT metric_seq FROM metric WHERE metric_key = 'extrisk_{sp_cat}')
      -- GROUP BY z.zone_seq, z.tbl, z.fld, z.value
      GROUP BY z.zone_seq "
      )
    )
  }
}
m_key: extrisk_bird_ecoregion_min
m_key: extrisk_bird_ecoregion_max
m_key: extrisk_coral_ecoregion_min
m_key: extrisk_coral_ecoregion_max
m_key: extrisk_fish_ecoregion_min
m_key: extrisk_fish_ecoregion_max
m_key: extrisk_invertebrate_ecoregion_min
m_key: extrisk_invertebrate_ecoregion_max
m_key: extrisk_mammal_ecoregion_min
m_key: extrisk_mammal_ecoregion_max
m_key: extrisk_other_ecoregion_min
m_key: extrisk_other_ecoregion_max
m_key: extrisk_turtle_ecoregion_min
m_key: extrisk_turtle_ecoregion_max
Code
tbl(con_sdm, "metric") |>
  filter(str_detect(metric_key, "^extrisk_.*_ecoregion_(min|max)$")) |>
  arrange(metric_seq) |>
  select(-date_created) |>
  collect() |>
  print(n = Inf)
# A tibble: 14 × 5
   metric_seq metric_key                  description metric_title metric_abbrev
        <int> <chr>                       <chr>       <chr>        <chr>        
 1        913 extrisk_bird_ecoregion_min  Extinction… <NA>         <NA>         
 2        914 extrisk_bird_ecoregion_max  Extinction… <NA>         <NA>         
 3        915 extrisk_coral_ecoregion_min Extinction… <NA>         <NA>         
 4        916 extrisk_coral_ecoregion_max Extinction… <NA>         <NA>         
 5        917 extrisk_fish_ecoregion_min  Extinction… <NA>         <NA>         
 6        918 extrisk_fish_ecoregion_max  Extinction… <NA>         <NA>         
 7        919 extrisk_invertebrate_ecore… Extinction… <NA>         <NA>         
 8        920 extrisk_invertebrate_ecore… Extinction… <NA>         <NA>         
 9        921 extrisk_mammal_ecoregion_m… Extinction… <NA>         <NA>         
10        922 extrisk_mammal_ecoregion_m… Extinction… <NA>         <NA>         
11        923 extrisk_other_ecoregion_min Extinction… <NA>         <NA>         
12        924 extrisk_other_ecoregion_max Extinction… <NA>         <NA>         
13        925 extrisk_turtle_ecoregion_m… Extinction… <NA>         <NA>         
14        926 extrisk_turtle_ecoregion_m… Extinction… <NA>         <NA>         

4.3 Calculate rescaled ExtinctionRisk per cell based on EcoregionMinMax per SpeciesCategory

Code
# For each species category, (re)build the cell-level metric
# 'extrisk_{sp_cat}_ecoregion_rescaled': raw extinction risk rescaled to
# [0,100] within each ecoregion (using the previously stored ecoregional
# min/max), with multi-ecoregion cells blended by normalized coverage.
for (sp_cat in sp_cats) {
  # sp_cat = sp_cats[1]

  m_key <- glue("extrisk_{sp_cat}_ecoregion_rescaled")
  m_description <- glue(
    "Extinction risk for {sp_cat}, rescaled to [0,100] based on Ecoregional min/max values"
  )

  # Get metric_seq first, then delete/insert
  # (delete removes the metric row plus any dependent cell/zone values
  # so the rebuild below starts clean)
  existing_seq <- dbGetQuery(
    con_sdm,
    glue(
      "SELECT metric_seq FROM metric WHERE metric_key = '{m_key}'"
    )
  )
  if (nrow(existing_seq) > 0) {
    m_seq <- existing_seq$metric_seq
    dbExecute(
      con_sdm,
      glue("DELETE FROM metric      WHERE metric_seq = {m_seq}")
    )
    dbExecute(
      con_sdm,
      glue("DELETE FROM cell_metric WHERE metric_seq = {m_seq}")
    )
    dbExecute(
      con_sdm,
      glue("DELETE FROM zone_metric WHERE metric_seq = {m_seq}")
    )
  }

  # insert new metric
  dbExecute(
    con_sdm,
    glue(
      "
      INSERT INTO metric (metric_key, description)
      VALUES ('{m_key}', '{m_description}')"
    )
  )
  # fetch the auto-assigned metric_seq for the freshly inserted metric
  m_seq <- dbGetQuery(
    con_sdm,
    glue(
      "
      SELECT metric_seq FROM metric WHERE metric_key = '{m_key}'"
    )
  ) |>
    pull(metric_seq)

  # Rescale per cell: (value - er_min) / (er_max - er_min), weighted by each
  # cell's (normalized) ecoregion coverage, then scaled to 0-100.
  # NOTE(review): ecoregion_minmax takes MIN over the *_min metric and MAX
  # over the *_max metric per zone — relies on only those two metric rows
  # existing per zone; confirm against how *_ecoregion_min/max were loaded.
  dbExecute(
    con_sdm,
    glue(
      "
      WITH
      -- get ecoregion_minmax from zone_metric
      ecoregion_minmax AS (
        SELECT 
          z.zone_seq,
          MIN(zm.value) as min_value,
          MAX(zm.value) as max_value
        FROM zone z
        JOIN zone_metric zm USING (zone_seq)
        WHERE 
          z.fld = 'ecoregion_key' AND 
          zm.metric_seq IN (
            SELECT metric_seq FROM metric WHERE metric_key IN (
              'extrisk_{sp_cat}_ecoregion_min',
              'extrisk_{sp_cat}_ecoregion_max') )
        GROUP BY z.zone_seq
      ),
      -- get cell_ecoregion from zone_cell with pct_covered
      cell_ecoregion_raw AS (
        SELECT 
          cell_id, 
          z.zone_seq, 
          zone_cell.pct_covered
        FROM zone_cell
        JOIN zone z ON zone_cell.zone_seq = z.zone_seq
        WHERE z.fld = 'ecoregion_key'
      ),
      -- calculate total coverage per cell for normalization
      cell_coverage_total AS (
        SELECT 
          cell_id, 
          SUM(pct_covered) as total_coverage
        FROM cell_ecoregion_raw
        GROUP BY cell_id
      ),
      -- normalize coverage percentages
      -- for cells with only one ecoregion, use 100% regardless of actual coverage
      -- for cells with multiple ecoregions, rescale to ensure percentages sum to 100
      cell_ecoregion AS (
        SELECT 
          ce.cell_id, 
          ce.zone_seq,
          CASE 
            WHEN COUNT(*) OVER (PARTITION BY ce.cell_id) = 1 THEN 100
            ELSE ce.pct_covered * 100.0 / ct.total_coverage
          END as normalized_pct
        FROM cell_ecoregion_raw ce
        JOIN cell_coverage_total ct ON ce.cell_id = ct.cell_id
      )
      -- insert rescaled extinction risk values
      INSERT INTO cell_metric (cell_id, metric_seq, value)
      SELECT 
        cm.cell_id,
        {m_seq} as metric_seq,
        SUM(
          (cm.value     - er.min_value) / 
          (er.max_value - er.min_value) * 
          (ce.normalized_pct / 100.0) ) * 100 AS value  -- scale to 0-100 range
      FROM cell_metric      cm
      JOIN cell_ecoregion   ce ON cm.cell_id  = ce.cell_id
      JOIN ecoregion_minmax er ON ce.zone_seq = er.zone_seq
      WHERE cm.metric_seq IN (
        SELECT metric_seq FROM metric WHERE metric_key = 'extrisk_{sp_cat}')
      AND cm.value IS NOT NULL
      AND er.min_value IS NOT NULL
      AND er.max_value IS NOT NULL
      AND er.min_value < er.max_value  -- avoid division by zero
      GROUP BY cm.cell_id
    "
    )
  )
}

# Verify: list the freshly (re)created rescaled extinction-risk metrics.
rescaled_pattern <- "^extrisk_.*_ecoregion_rescaled$"
d_rescaled <- tbl(con_sdm, "metric") |>
  filter(str_detect(metric_key, rescaled_pattern)) |>
  select(-date_created) |>
  arrange(metric_seq) |>
  collect()
print(d_rescaled, n = Inf)
# A tibble: 7 × 5
  metric_seq metric_key                   description metric_title metric_abbrev
       <int> <chr>                        <chr>       <chr>        <chr>        
1        927 extrisk_bird_ecoregion_resc… Extinction… <NA>         <NA>         
2        928 extrisk_coral_ecoregion_res… Extinction… <NA>         <NA>         
3        929 extrisk_fish_ecoregion_resc… Extinction… <NA>         <NA>         
4        930 extrisk_invertebrate_ecoreg… Extinction… <NA>         <NA>         
5        931 extrisk_mammal_ecoregion_re… Extinction… <NA>         <NA>         
6        932 extrisk_other_ecoregion_res… Extinction… <NA>         <NA>         
7        933 extrisk_turtle_ecoregion_re… Extinction… <NA>         <NA>         

4.4 Transfer raw PrimProd into cell_metric

Code
# Transfer raw primary productivity (VGPM) into cell_metric ----
# Prepares the 'primprod' metric: loads the VGPM annual-average NPP raster,
# resamples it onto the analysis cell grid, and clears any prior metric rows.
m_key <- "primprod"
m_description <- "Primary productivity: Oregon State Vertically Generalized Production Model (VGPM) from Visible Infrared Imaging Radiometer Suite (VIIRS) satellite data (mg C / m^2 / day) from daily averages available as monthly averaged to annual and averaged to overall for the most recently available full years of data 2014 to 2023"
# TODO: consider log rescaling PrimaryProductivity

# workflow: ingest_productivity.qmd
dir_vgpm <- glue("{dir_data}/raw/oregonstate.edu")
vgpm_tif <- glue(
  "{dir_vgpm}/vgpm.r2022.v.chl.v.sst.2160x4320_2014-2023.avg.sd.tif"
)

# cell grid; round extents so resample() aligns despite float noise
r_cell <- rast(cell_tif) |> subset("cell_id")
ext(r_cell) <- round(ext(r_cell), 3)
# r_cell_r <- rotate(r_cell)
# ext(r_cell_r) <- round(ext(r_cell_r), 3)

r_vgpm <- rast(vgpm_tif) |> subset("npp_avg")
ext(r_vgpm) <- round(ext(r_vgpm), 3)
# r_vgpm_rc <- rotate(r_vgpm) |>
#   crop(r_cell) |>
#   mask(r_cell)

# bilinear resample of NPP onto the cell grid, masked/cropped to cells
r_v_c <- resample(
  r_vgpm,
  r_cell,
  method = "bilinear"
) |>
  mask(r_cell) |>
  crop(r_cell)
# plot(r_v_c)
r_cv <- c(r_cell, r_v_c)

# Check if metric exists and delete if it does (metric row + dependent
# cell_metric/zone_metric rows). Use IN (...) so all stale rows sharing the
# key are removed in one statement — a bare '= {m_seq}' would glue-vectorize
# into multiple SQL strings if more than one row matched.
existing_seq <- dbGetQuery(
  con_sdm,
  glue(
    "SELECT metric_seq FROM metric WHERE metric_key = '{m_key}'"
  )
)
if (nrow(existing_seq) > 0) {
  m_seqs_csv <- paste(existing_seq$metric_seq, collapse = ",")
  dbExecute(con_sdm, glue("DELETE FROM metric      WHERE metric_seq IN ({m_seqs_csv})"))
  dbExecute(con_sdm, glue("DELETE FROM cell_metric WHERE metric_seq IN ({m_seqs_csv})"))
  dbExecute(con_sdm, glue("DELETE FROM zone_metric WHERE metric_seq IN ({m_seqs_csv})"))
}
[1] 32
Code
# Register the new primprod metric row (key + description).
sql_new_metric <- glue(
  "
    INSERT INTO metric (metric_key, description)
    VALUES ('{m_key}', '{m_description}')"
)
dbExecute(con_sdm, sql_new_metric)
[1] 1
Code
# Look up the metric_seq just assigned to 'primprod'.
m_seq <- dbGetQuery(
  con_sdm,
  glue(
    "
    SELECT metric_seq FROM metric WHERE metric_key = '{m_key}'"
  )
) |>
  pull(metric_seq)

# Extract per-cell NPP values (dropping NA rows) and shape for cell_metric.
# Use TRUE (not the reassignable alias T) for logical arguments.
d_cv <- values(r_cv, dataframe = TRUE, na.rm = TRUE) |>
  tibble() |>
  rename(
    value = npp_avg
  ) |>
  mutate(
    cell_id = as.integer(cell_id),
    metric_seq = !!m_seq
  )

# Append the per-cell primprod values to cell_metric.
dbWriteTable(con_sdm, "cell_metric", d_cv, append = TRUE)

# Verify: list all primprod* metrics now registered.
tbl(con_sdm, "metric") |>
  filter(str_detect(metric_key, "^primprod")) |>
  arrange(metric_seq) |>
  select(-date_created) |>
  collect() |>
  print(n = Inf)
# A tibble: 7 × 5
  metric_seq metric_key                   description metric_title metric_abbrev
       <int> <chr>                        <chr>       <chr>        <chr>        
1        820 primprod_ecoregion_rescaled… Pre-pct_ar… <NA>         <NA>         
2        893 primprod_ecoregion_min       Primary pr… <NA>         <NA>         
3        894 primprod_ecoregion_max       Primary pr… <NA>         <NA>         
4        895 primprod_avg                 Primary pr… <NA>         <NA>         
5        896 primprod_stddev              Primary pr… <NA>         <NA>         
6        897 primprod_ecoregion_rescaled  Primary pr… Primary Pro… Prim. Prod.  
7        934 primprod                     Primary pr… <NA>         <NA>         
Code
#   metric_seq metric_key description
#        <int> <chr>      <chr>
# 1        160 primprod   Primary productivity: Oregon State Vertically Generali…

# visual check ----
# d <- tbl(con_sdm, "cell_metric") |>
#   filter(metric_seq == m_seq) |>
#   select(cell_id, value) |>
#   collect()
# r <- init(r_cell[[1]], NA)
# r[d$cell_id] <- d$value
# r_r <- rotate(r)
# plet(r_r)

4.5 Calculate PrimProd min/max by Ecoregion

Code
# Compute ecoregional min and max of raw primary productivity and store them
# as zone_metric rows ('primprod_ecoregion_min' / 'primprod_ecoregion_max');
# these bounds feed the per-cell rescaling in the next section.
pp_metric <- "primprod"
pp_seq <- dbGetQuery(
  con_sdm,
  glue(
    "SELECT metric_seq FROM metric WHERE metric_key = '{pp_metric}'"
  )
) |>
  pull(metric_seq)

for (fxn in c("min", "max")) {
  # fxn = "min"

  # get existing primprod ecoregion min/max metric_seq
  m_key <- glue("primprod_ecoregion_{fxn}")
  m_description <- glue("Primary productivity Ecoregional {fxn}")

  # Check if metric exists and delete if it does
  # (clears the metric row and any dependent cell/zone values)
  existing_seq <- dbGetQuery(
    con_sdm,
    glue(
      "SELECT metric_seq FROM metric WHERE metric_key = '{m_key}'"
    )
  )
  if (nrow(existing_seq) > 0) {
    m_seq <- existing_seq$metric_seq
    dbExecute(
      con_sdm,
      glue("DELETE FROM metric      WHERE metric_seq = {m_seq}")
    )
    dbExecute(
      con_sdm,
      glue("DELETE FROM cell_metric WHERE metric_seq = {m_seq}")
    )
    dbExecute(
      con_sdm,
      glue("DELETE FROM zone_metric WHERE metric_seq = {m_seq}")
    )
  }

  # insert new metric
  dbExecute(
    con_sdm,
    glue(
      "INSERT INTO metric (metric_key, description) VALUES ('{m_key}', '{m_description}')"
    )
  )

  # get metric_seq
  m_seq <- dbGetQuery(
    con_sdm,
    glue(
      "SELECT metric_seq FROM metric WHERE metric_key = '{m_key}'"
    )
  ) |>
    pull(metric_seq)

  # Calculate min/max primary productivity per ecoregion
  # ({fxn} interpolates to the SQL aggregate MIN or MAX)
  dbExecute(
    con_sdm,
    glue(
      "INSERT INTO zone_metric (zone_seq, metric_seq, value)
    SELECT 
      z.zone_seq,
      -- z.tbl, z.fld, z.value AS tbl_fld_val,
      {m_seq} as metric_seq,
      {fxn}(cm.value) as value
    FROM      zone        z
    LEFT JOIN zone_cell   zc ON z.zone_seq = zc.zone_seq
    LEFT JOIN cell_metric cm ON zc.cell_id = cm.cell_id
    WHERE
      z.fld         = 'ecoregion_key' AND
      cm.metric_seq = {pp_seq}
    GROUP BY z.zone_seq"
    )
  )
}
# sanity check: distribution of ecoregional minima across zones
tbl(con_sdm, "metric") |>
  filter(
    metric_key == "primprod_ecoregion_min"
  ) |>
  left_join(
    tbl(con_sdm, "zone_metric"),
    by = "metric_seq"
  ) |>
  pull(value) |>
  summary()
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  35.15  135.08  267.47  257.26  366.07  454.43 
Code
# OLD:
#  Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
# 65.61  130.50  237.31  247.02  358.66  435.62
# NEW:
#  Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
# 35.15  135.08  267.47  257.26  366.07  454.43
# Sanity check: distribution of ecoregional maxima across zones.
d_pp_max <- tbl(con_sdm, "metric") |>
  filter(metric_key == "primprod_ecoregion_max") |>
  left_join(tbl(con_sdm, "zone_metric"), by = "metric_seq") |>
  collect()
summary(d_pp_max$value)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  750.8  3208.6  4229.2  4185.3  5347.1  7913.8 

4.6 Calculate PrimProd avg/sd by Planning Area

Code
# Compute overall average and standard deviation of raw primary productivity
# per BOEM planning area ('primprod_avg' / 'primprod_stddev') as zone_metric
# rows. Note this loop deletes stale metrics with IN (...) so multiple
# matching rows are handled in one statement.
pp_metric <- "primprod"
pp_seq <- dbGetQuery(
  con_sdm,
  glue(
    "SELECT metric_seq FROM metric WHERE metric_key = '{pp_metric}'"
  )
) |>
  pull(metric_seq)

for (fxn in c("avg", "stddev")) {
  # fxn = "avg"

  # get existing primprod avg/stddev metric_seq
  m_key <- glue("primprod_{fxn}")
  m_description <- glue("Primary productivity {fxn}")

  # Check if metric exists and delete if it does
  existing_seq <- dbGetQuery(
    con_sdm,
    glue(
      "SELECT metric_seq FROM metric WHERE metric_key = '{m_key}'"
    )
  ) |>
    pull(metric_seq)
  if (length(existing_seq) > 0) {
    dbExecute(
      con_sdm,
      glue(
        "DELETE FROM metric      WHERE metric_seq IN ({paste(existing_seq, collapse = ',')})"
      )
    )
    dbExecute(
      con_sdm,
      glue(
        "DELETE FROM cell_metric WHERE metric_seq IN ({paste(existing_seq, collapse = ',')})"
      )
    )
    dbExecute(
      con_sdm,
      glue(
        "DELETE FROM zone_metric WHERE metric_seq IN ({paste(existing_seq, collapse = ',')})"
      )
    )
  }

  # insert new metric
  dbExecute(
    con_sdm,
    glue(
      "INSERT INTO metric (metric_key, description) VALUES ('{m_key}', '{m_description}')"
    )
  )

  # get metric_seq
  m_seq <- dbGetQuery(
    con_sdm,
    glue(
      "SELECT metric_seq FROM metric WHERE metric_key = '{m_key}'"
    )
  ) |>
    pull(metric_seq)

  # guard: the key must resolve to exactly one metric row
  stopifnot(length(m_seq) == 1)

  # calculate avg/stddev primary productivity per planning area
  # ({fxn} interpolates to the SQL aggregate AVG or STDDEV)
  # dbGetQuery(con_sdm, glue(
  #   "SELECT
  dbExecute(
    con_sdm,
    glue(
      "INSERT INTO zone_metric (zone_seq, metric_seq, value)
     SELECT
      z.zone_seq,
      {m_seq} as metric_seq,
      {fxn}(cm.value) as value
    FROM      zone        z
    LEFT JOIN zone_cell   zc ON z.zone_seq = zc.zone_seq
    LEFT JOIN cell_metric cm ON zc.cell_id = cm.cell_id
    WHERE
      z.fld         = 'planarea_key' AND
      cm.metric_seq = {pp_seq}
    GROUP BY z.zone_seq"
    )
  )
}

# tbl(con_sdm, "cell_metric") |>
#   filter(
#     metric_seq == !!pp_seq) |>
#   pull(value) |>
#   summary()

# Sanity check: distribution of planning-area mean primary productivity.
d_pp_avg <- tbl(con_sdm, "metric") |>
  filter(metric_key == "primprod_avg") |>
  left_join(tbl(con_sdm, "zone_metric"), by = "metric_seq") |>
  collect()
summary(d_pp_avg$value)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  124.4   369.6   602.2   652.7   817.8  1930.0 

4.7 Calculate rescaled PrimProd per cell based on EcoregionMinMax

Code
# Rescale primary productivity per cell by ecoregional min/max ----
# Setup: resolve the raw primprod metric_seq and clear any prior
# 'primprod_ecoregion_rescaled' metric rows before rebuilding.
pp_metric <- "primprod"
pp_seq <- dbGetQuery(
  con_sdm,
  glue(
    "SELECT metric_seq FROM metric WHERE metric_key = '{pp_metric}'"
  )
) |>
  pull(metric_seq)

# Create new metric for rescaled primary productivity
m_key <- "primprod_ecoregion_rescaled"
m_description <- "Primary productivity rescaled to [0,100] based on Ecoregional min/max values"
# TODO: consider log rescaling PrimaryProductivity

# Check if metric exists and delete if it does.
# Use IN (...) so this stays one statement even if more than one metric_seq
# matches: a bare '= {existing_seq}' would glue-vectorize into multiple SQL
# strings and dbExecute() would error (matches the avg/stddev loop pattern).
existing_seq <- dbGetQuery(
  con_sdm,
  glue(
    "SELECT metric_seq FROM metric WHERE metric_key = '{m_key}'"
  )
) |>
  pull(metric_seq)
if (length(existing_seq) > 0) {
  seqs_csv <- paste(existing_seq, collapse = ",")
  dbExecute(
    con_sdm,
    glue("DELETE FROM metric      WHERE metric_seq IN ({seqs_csv})")
  )
  dbExecute(
    con_sdm,
    glue("DELETE FROM cell_metric WHERE metric_seq IN ({seqs_csv})")
  )
  dbExecute(
    con_sdm,
    glue("DELETE FROM zone_metric WHERE metric_seq IN ({seqs_csv})")
  )
}
[1] 32
Code
# Register the rescaled-primprod metric row (key + description).
sql_new_metric <- glue(
  "
    INSERT INTO metric (metric_key, description)
    VALUES ('{m_key}', '{m_description}')"
)
dbExecute(con_sdm, sql_new_metric)
[1] 1
Code
# Resolve the metric_seq just assigned to 'primprod_ecoregion_rescaled'.
m_seq <- dbGetQuery(
  con_sdm,
  glue(
    "
    SELECT metric_seq FROM metric WHERE metric_key = '{m_key}'"
  )
) |>
  pull(metric_seq)

# Rescale primary productivity by ecoregion:
# per cell, (value - er_min) / (er_max - er_min) weighted by normalized
# ecoregion coverage, then scaled to 0-100 — same CTE structure as the
# extrisk rescaling above.
# dbGetQuery(con_sdm, glue("
dbExecute(
  con_sdm,
  glue(
    "
    WITH
    -- get ecoregion_minmax from cell_metric for primary productivity
    ecoregion_minmax AS (
      SELECT 
        z.zone_seq,
        MIN(zm.value) as min_value,
        MAX(zm.value) as max_value
      FROM zone z
      JOIN zone_metric zm USING (zone_seq)
      WHERE 
        z.fld = 'ecoregion_key' AND 
        zm.metric_seq IN (
          SELECT metric_seq FROM metric WHERE metric_key IN (
            'primprod_ecoregion_min',
            'primprod_ecoregion_max') )
      GROUP BY z.zone_seq ),
    -- get cell_ecoregion from zone_cell with pct_covered
    cell_ecoregion_raw AS (
      SELECT 
        cell_id, 
        z.zone_seq, 
        zone_cell.pct_covered
      FROM zone_cell
      JOIN zone z ON zone_cell.zone_seq = z.zone_seq
      WHERE z.fld = 'ecoregion_key'
    ),
    -- calculate total coverage per cell for normalization
    cell_coverage_total AS (
      SELECT 
        cell_id, 
        SUM(pct_covered) as total_coverage
      FROM cell_ecoregion_raw
      GROUP BY cell_id
    ),
    -- normalize coverage percentages
    cell_ecoregion AS (
      SELECT 
        ce.cell_id, 
        ce.zone_seq,
        CASE 
          WHEN COUNT(*) OVER (PARTITION BY ce.cell_id) = 1 THEN 100
          ELSE ce.pct_covered * 100.0 / ct.total_coverage
        END as normalized_pct
      FROM cell_ecoregion_raw ce
      JOIN cell_coverage_total ct ON ce.cell_id = ct.cell_id
    )
    -- insert rescaled primary productivity values
    INSERT INTO cell_metric (cell_id, metric_seq, value)
    SELECT 
      cm.cell_id,
      {m_seq} as metric_seq,
      SUM(
        -- (c.prim_prod_mean - er.min_value) / 
        (cm.value     - er.min_value) / 
        (er.max_value - er.min_value) * 
        (ce.normalized_pct / 100.0) ) * 100 AS value  -- scale to 0-100 range
    -- FROM cell c
    FROM cell_metric cm
    JOIN cell_ecoregion   ce ON cm.cell_id  = ce.cell_id
    JOIN ecoregion_minmax er ON ce.zone_seq = er.zone_seq
    WHERE 
    --c.prim_prod_mean IS NOT NULL AND 
    cm.metric_seq = {pp_seq} AND
    cm.value IS NOT NULL AND 
    ce.zone_seq IS NOT NULL AND
    er.min_value IS NOT NULL AND 
    er.max_value IS NOT NULL AND 
    er.min_value < er.max_value  -- avoid division by zero
    GROUP BY cm.cell_id
"
  )
)
[1] 623920
Code
# tbl(con_sdm, "cell_metric") |>
#   filter(
#     metric_seq == !!m_seq)

# Sanity check: distribution of rescaled primary productivity across cells.
d_pp_rescaled <- tbl(con_sdm, "metric") |>
  filter(metric_key == "primprod_ecoregion_rescaled") |>
  left_join(tbl(con_sdm, "cell_metric"), by = "metric_seq") |>
  collect()
summary(d_pp_rescaled$value)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.000   2.619   5.730  10.379  15.007 100.000 

4.8 Calculate ProgramArea metrics contributing to score based on cell metrics

Code
# Zone sequence ids for the BOEM program areas.
zone_lazy <- tbl(con_sdm, "zone")
z_seqs <- zone_lazy |>
  filter(tbl == !!tbl_pra) |>
  pull(zone_seq)

message(glue("Calculating metrics for {length(z_seqs)} program areas..."))
Calculating metrics for 20 program areas...
Code
# Calculating metrics for 20 program areas...

# Metric keys to summarize per program area: raw + rescaled extinction risk
# for each species category, plus raw + rescaled primary productivity.
metrics <- c(
  glue("extrisk_{sp_cats}"),
  glue("extrisk_{sp_cats}_ecoregion_rescaled"),
  "primprod",
  "primprod_ecoregion_rescaled"
)

# For each metric: clear any prior PA-level rows, then insert the
# coverage-weighted mean of cell values per program-area zone.
for (m_key in metrics) {
  # m_key = metrics[1]

  # get existing metric_seq
  m_seq <- tbl(con_sdm, "metric") |>
    filter(metric_key == !!m_key) |>
    pull(metric_seq)

  if (length(m_seq) != 1) {
    message(glue("  Skipping {m_key} - metric not found"))
    next
  }

  # delete existing zone_metric values for this metric and program areas
  dbExecute(
    con_sdm,
    glue(
      "DELETE FROM zone_metric
    WHERE metric_seq = {m_seq}
      AND zone_seq IN ({paste(z_seqs, collapse = ',')})"
    )
  )

  # stop if no cell_metric found
  n_cm <- tbl(con_sdm, "cell_metric") |>
    filter(metric_seq == !!m_seq) |>
    tally() |>
    pull(n)

  if (n_cm == 0) {
    message(glue("  Skipping {m_key} - no cell_metric values"))
    next
  }

  # calculate average score per program area weighted by cell coverage
  # (weights are zone_cell.pct_covered; cells lacking a value are excluded)
  dbExecute(
    con_sdm,
    glue(
      "
    INSERT INTO zone_metric (zone_seq, metric_seq, value)
    SELECT
      zc.zone_seq,
      {m_seq} as metric_seq,
      SUM(cm.value * zc.pct_covered) / SUM(zc.pct_covered) as value
    FROM zone_cell zc
    JOIN cell_metric cm ON zc.cell_id = cm.cell_id
    WHERE
      cm.metric_seq = {m_seq}
      AND zc.zone_seq IN ({paste(z_seqs, collapse = ',')})
      AND cm.value IS NOT NULL
    GROUP BY zc.zone_seq"
    )
  )

  message(glue("  Calculated {m_key}"))
}
  Calculated extrisk_bird
  Calculated extrisk_coral
  Calculated extrisk_fish
  Calculated extrisk_invertebrate
  Calculated extrisk_mammal
  Calculated extrisk_other
  Calculated extrisk_turtle
  Calculated extrisk_bird_ecoregion_rescaled
  Calculated extrisk_coral_ecoregion_rescaled
  Calculated extrisk_fish_ecoregion_rescaled
  Calculated extrisk_invertebrate_ecoregion_rescaled
  Calculated extrisk_mammal_ecoregion_rescaled
  Calculated extrisk_other_ecoregion_rescaled
  Calculated extrisk_turtle_ecoregion_rescaled
  Calculated primprod
  Calculated primprod_ecoregion_rescaled
Code
# verify: join PA-level zone_metric values back to metric keys and PA keys
pra_zones <- tbl(con_sdm, "zone") |>
  filter(tbl == !!tbl_pra) |>
  select(zone_seq, pra_key = value)

d_ck <- tbl(con_sdm, "metric") |>
  select(metric_seq, metric_key) |>
  filter(metric_key %in% metrics) |>
  left_join(tbl(con_sdm, "zone_metric"), by = "metric_seq") |>
  left_join(pra_zones, by = "zone_seq") |>
  filter(!is.na(pra_key)) |>
  collect()

# distribution of the rescaled component metrics across program areas
d_ck |>
  filter(str_detect(metric_key, "_ecoregion_rescaled")) |>
  pull(value) |>
  summary()
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.8816  9.5945 19.1908 28.7270 45.9746 98.9915 
Code
#  Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
# 1.195  11.186  21.902  27.317  41.150  91.748

4.9 Apply pct_area weighting to ProgramArea component metrics

Weight each component’s PA-level metric by the fraction of the PA’s cells that have non-NA data for that component, so components with limited spatial coverage (e.g., turtles in GEO) are down-weighted before score aggregation.

Code
# component metric keys to be down-weighted by spatial data coverage
component_keys <- c(
  glue("extrisk_{sp_cats}_ecoregion_rescaled"),
  "primprod_ecoregion_rescaled"
)

# zone sequence ids for the program areas
z_seqs <- tbl(con_sdm, "zone") |>
  filter(tbl == !!tbl_pra) |>
  pull(zone_seq)

message(glue(
  "Applying pct_area weighting to {length(component_keys)} component metrics across {length(z_seqs)} program areas..."
))
Applying pct_area weighting to 8 component metrics across 20 program areas...
Code
# For each component metric: back up its PA-level values to a
# '*_prepctareaweighting' metric, then multiply the live values in place by
# pct_area = (coverage of cells with data) / (total zone coverage), so
# sparsely-covered components (e.g. turtles in GEO) are down-weighted.
for (m_key in component_keys) {
  # m_key = component_keys[1]

  m_seq <- tbl(con_sdm, "metric") |>
    filter(metric_key == !!m_key) |>
    pull(metric_seq)
  stopifnot(length(m_seq) == 1)

  # create _prepctareaweighting metric to preserve original values
  m_key_pre <- glue("{m_key}_prepctareaweighting")
  m_seq_pre <- tbl(con_sdm, "metric") |>
    filter(metric_key == !!m_key_pre) |>
    pull(metric_seq)

  if (length(m_seq_pre) == 0) {
    dbExecute(
      con_sdm,
      glue(
        "INSERT INTO metric (metric_key, description)
      VALUES ('{m_key_pre}', 'Pre-pct_area weighting backup of {m_key}')"
      )
    )
    m_seq_pre <- tbl(con_sdm, "metric") |>
      filter(metric_key == !!m_key_pre) |>
      pull(metric_seq)
  }

  # delete existing _prepctareaweighting values for PA zones
  dbExecute(
    con_sdm,
    glue(
      "DELETE FROM zone_metric
    WHERE metric_seq = {m_seq_pre}
      AND zone_seq IN ({paste(z_seqs, collapse = ',')})"
    )
  )

  # copy current values to _prepctareaweighting
  # NOTE(review): re-running this section after the UPDATE below would back
  # up already-weighted values — assumes one weighting pass per rebuild.
  dbExecute(
    con_sdm,
    glue(
      "INSERT INTO zone_metric (zone_seq, metric_seq, value)
    SELECT zone_seq, {m_seq_pre} AS metric_seq, value
    FROM zone_metric
    WHERE metric_seq = {m_seq}
      AND zone_seq IN ({paste(z_seqs, collapse = ',')})"
    )
  )

  # update _ecoregion_rescaled values in place: value = value * pct_area
  dbExecute(
    con_sdm,
    glue(
      "
    UPDATE zone_metric
    SET value = zone_metric.value * sub.pct_area
    FROM (
      WITH
      zone_total AS (
        SELECT zone_seq, SUM(pct_covered) AS total_coverage
        FROM zone_cell
        WHERE zone_seq IN ({paste(z_seqs, collapse = ',')})
        GROUP BY zone_seq
      ),
      zone_metric_coverage AS (
        SELECT zc.zone_seq,
          SUM(zc.pct_covered) AS metric_coverage
        FROM zone_cell zc
        JOIN cell_metric cm ON zc.cell_id = cm.cell_id
        WHERE zc.zone_seq IN ({paste(z_seqs, collapse = ',')})
          AND cm.metric_seq = {m_seq}
          AND cm.value IS NOT NULL
        GROUP BY zc.zone_seq
      )
      SELECT
        zmc.zone_seq,
        zmc.metric_coverage / zt.total_coverage AS pct_area
      FROM zone_metric_coverage zmc
      JOIN zone_total zt ON zmc.zone_seq = zt.zone_seq
    ) AS sub
    WHERE zone_metric.zone_seq = sub.zone_seq
      AND zone_metric.metric_seq = {m_seq}
      AND zone_metric.zone_seq IN ({paste(z_seqs, collapse = ',')})"
    )
  )

  message(glue("  Applied pct_area to {m_key}"))
}
  Applied pct_area to extrisk_bird_ecoregion_rescaled
  Applied pct_area to extrisk_coral_ecoregion_rescaled
  Applied pct_area to extrisk_fish_ecoregion_rescaled
  Applied pct_area to extrisk_invertebrate_ecoregion_rescaled
  Applied pct_area to extrisk_mammal_ecoregion_rescaled
  Applied pct_area to extrisk_other_ecoregion_rescaled
  Applied pct_area to extrisk_turtle_ecoregion_rescaled
  Applied pct_area to primprod_ecoregion_rescaled
Code
# write pct_area diagnostics CSV: per program area x component metric, the
# pre-weighting value, the pct_area weight applied, and the post-weighting
# value (joined via the '*_prepctareaweighting' backup metrics).
d_pct <- dbGetQuery(
  con_sdm,
  glue(
    "
  WITH
  zone_total AS (
    SELECT zone_seq, SUM(pct_covered) AS total_coverage
    FROM zone_cell
    WHERE zone_seq IN ({paste(z_seqs, collapse = ',')})
    GROUP BY zone_seq
  ),
  zone_metric_coverage AS (
    SELECT zc.zone_seq, cm.metric_seq,
      SUM(zc.pct_covered) AS metric_coverage
    FROM zone_cell zc
    JOIN cell_metric cm ON zc.cell_id = cm.cell_id
    WHERE zc.zone_seq IN ({paste(z_seqs, collapse = ',')})
      AND cm.metric_seq IN (
        SELECT metric_seq FROM metric
        WHERE metric_key IN ({paste0(\"'\", component_keys, \"'\", collapse = ',')})
      )
      AND cm.value IS NOT NULL
    GROUP BY zc.zone_seq, cm.metric_seq
  )
  SELECT
    z.value    AS programarea_key,
    m.metric_key,
    zm_pre.value AS value_pre,
    ROUND(zmc.metric_coverage / zt.total_coverage, 4) AS pct_area,
    zm.value   AS value_post
  FROM zone_metric_coverage zmc
  JOIN zone_total zt ON zmc.zone_seq = zt.zone_seq
  JOIN zone z ON zmc.zone_seq = z.zone_seq
  JOIN metric m ON zmc.metric_seq = m.metric_seq
  JOIN zone_metric zm
    ON zmc.zone_seq = zm.zone_seq
    AND zmc.metric_seq = zm.metric_seq
  JOIN metric m_pre
    ON m_pre.metric_key = m.metric_key || '_prepctareaweighting'
  JOIN zone_metric zm_pre
    ON zmc.zone_seq  = zm_pre.zone_seq
    AND m_pre.metric_seq = zm_pre.metric_seq
  ORDER BY z.value, m.metric_key"
  )
)
write_csv(d_pct, component_pct_pra_csv)

# Display pct_area as a wide table: one row per program area, one column per
# component (metric keys shortened for readability).
d_pct |>
  mutate(
    metric_key = str_replace_all(
      metric_key,
      c(
        "extrisk_" = "",
        "_ecoregion_rescaled" = "",
        "primprod" = "prim_prod"
      )
    )
  ) |>
  select(programarea_key, metric_key, pct_area) |>
  pivot_wider(names_from = metric_key, values_from = pct_area) |>
  print(n = Inf)
# A tibble: 20 × 9
   programarea_key  bird  coral  fish invertebrate mammal other  turtle
   <chr>           <dbl>  <dbl> <dbl>        <dbl>  <dbl> <dbl>   <dbl>
 1 ALA             1.000 1.000  1.000        1.000  1.000 1.000  0.884 
 2 ALB             1     1      1            1      1     1     NA     
 3 BFT             1     0.404  0.991        1      1     1     NA     
 4 BOW             1     1      1            1      1     1     NA     
 5 CEC             1     1      1            1      1     1      1     
 6 CHU             1     0.978  1            1      1     1     NA     
 7 COK             1     1      1            1      1     1      1     
 8 GAA             1     1      1            1      1     1      1     
 9 GAB             1     1      1            1      1     1      1     
10 GEO             1     1      1            1      1     1      0.0146
11 GOA             1     1      1            1      1     1      1     
12 HAR             1     0.0732 0.539        1      1     1     NA     
13 HOP             1     1      1            1      1     1     NA     
14 KOD             1     1      1            1      1     1      1     
15 MAT             1.000 1.000  1.000        1.000  1.000 1.000 NA     
16 NAV             1     1      1            1      1     1     NA     
17 NOC             1     1      1            1      1     1      1     
18 NOR             1     1      1            1      1     1     NA     
19 SHU             1.000 1.000  1.000        1.000  1.000 1.000  1.000 
20 SOC             1     1      1            1      1     1      1     
# ℹ 1 more variable: prim_prod <dbl>

4.10 Calculate Ecoregion metrics contributing to score based on cell metrics

Code
# get zone_seqs for ecoregions
z_seqs <- tbl(con_sdm, "zone") |>
  filter(
    tbl == !!tbl_er
  ) |>
  pull(zone_seq)
# guard against an empty zone set (would silently insert nothing below)
stopifnot(
  `No ecoregion zones found in zone table — check tbl_er value matches zone$tbl` = length(
    z_seqs
  ) >
    0
)

# tbl(con_sdm, "metric") |> pull(metric_key) |>  sort()

# Metric keys to summarize per ecoregion (same set as the program-area loop).
metrics <- c(
  glue("extrisk_{sp_cats}"),
  glue("extrisk_{sp_cats}_ecoregion_rescaled"),
  "primprod",
  "primprod_ecoregion_rescaled"
)

# For each metric: clear prior ecoregion-level rows, then insert the
# coverage-weighted mean of cell values per ecoregion zone.
for (m_key in metrics) {
  # m_key = metrics[1]

  # get existing metric_seq
  m_seq <- tbl(con_sdm, "metric") |>
    filter(metric_key == !!m_key) |>
    pull(metric_seq)
  stopifnot(length(m_seq) == 1)

  # delete existing zone_metric values for this metric and ecoregions
  dbExecute(
    con_sdm,
    glue(
      "DELETE FROM zone_metric
    WHERE metric_seq = {m_seq}
      AND zone_seq IN ({paste(z_seqs, collapse = ',')})"
    )
  )

  # stop if no cell_metric found
  n_cm <- tbl(con_sdm, "cell_metric") |>
    filter(
      metric_seq == !!m_seq
    ) |>
    tally() |>
    pull(n)
  stopifnot(n_cm > 0)

  # calculate average score per ecoregion weighted by cell coverage
  dbExecute(
    con_sdm,
    glue(
      "
    INSERT INTO zone_metric (zone_seq, metric_seq, value)
    SELECT 
      zc.zone_seq,
      {m_seq} as metric_seq,
      SUM(cm.value * zc.pct_covered) / SUM(zc.pct_covered) as value
    FROM zone_cell zc
    JOIN cell_metric cm ON zc.cell_id = cm.cell_id
    WHERE 
      cm.metric_seq = {m_seq} 
      AND zc.zone_seq IN ({paste(z_seqs, collapse = ',')})
      AND cm.value IS NOT NULL
    GROUP BY zc.zone_seq"
    )
  )
}

# verify: attach ecoregion zone_metric values to metric keys + ecoregion keys
er_zones <- tbl(con_sdm, "zone") |>
  filter(tbl == !!tbl_er) |>
  select(zone_seq, pa_key = value)

d_ck <- tbl(con_sdm, "metric") |>
  select(metric_seq, metric_key) |>
  filter(metric_key %in% metrics) |>
  left_join(tbl(con_sdm, "zone_metric"), by = "metric_seq") |>
  left_join(er_zones, by = "zone_seq") |>
  filter(!is.na(pa_key)) |>
  collect()

# distribution of the rescaled component metrics across ecoregions
d_ck |>
  filter(str_detect(metric_key, "_ecoregion_rescaled")) |>
  pull(value) |>
  summary()
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.8816 13.3941 21.2295 29.0360 40.2489 90.1758 
Code
# OLD:
#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
# 0.1106 12.9840 24.9402 28.6058 43.1917 83.1341
# NEW:
#  Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
# 2.365  13.926  21.852  26.010  35.288  80.208
# NEWEST:
#  Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
# 2.365  13.923  22.475  27.350  38.309  80.039

4.11 Calculate cell_metric score: ExtinctionRisk + PrimProd (already EcoregionRescaled), equal weight

Code
# Combined cell-level score: equal-weight mean of rescaled extinction risk
# per species category plus rescaled primary productivity.
m_key <- "score_extriskspcat_primprod_ecoregionrescaled_equalweights"
m_description <- "Combined score of extinction risk per species category and primary productivity, equally weighted (and each previously rescaled [0,100] based on Ecoregional min/max values)"

# Check if metric exists and delete if it does.
# Use IN (...) so the delete stays a single statement even if m_seq has
# length > 1 — a bare '= {m_seq}' would glue-vectorize into multiple strings.
m_seq <- dbGetQuery(
  con_sdm,
  glue(
    "SELECT metric_seq FROM metric WHERE metric_key = '{m_key}'"
  )
) |>
  pull(metric_seq)
if (length(m_seq) > 0) {
  seqs_csv <- paste(m_seq, collapse = ",")
  dbExecute(con_sdm, glue("DELETE FROM metric      WHERE metric_seq IN ({seqs_csv})"))
  dbExecute(con_sdm, glue("DELETE FROM cell_metric WHERE metric_seq IN ({seqs_csv})"))
  dbExecute(con_sdm, glue("DELETE FROM zone_metric WHERE metric_seq IN ({seqs_csv})"))
}
[1] 20
Code
# Register the combined-score metric row (key + description).
sql_new_metric <- glue(
  "
  INSERT INTO metric (metric_key, description)
  VALUES ('{m_key}', '{m_description}')"
)
dbExecute(con_sdm, sql_new_metric)
[1] 1
Code
# Resolve the metric_seq just assigned to the combined-score metric.
m_seq <- dbGetQuery(
  con_sdm,
  glue(
    "
  SELECT metric_seq FROM metric WHERE metric_key = '{m_key}'"
  )
) |>
  pull(metric_seq)

# Get all metric keys for rescaled extinction risk per species category plus
# primary productivity. vapply() (not sapply) guarantees a character vector
# regardless of input length; FALSE spelled out rather than the alias F.
metric_keys <- c(
  vapply(
    sp_cats,
    \(sc) as.character(glue("extrisk_{str_replace(sc, ' ', '_')}_ecoregion_rescaled")),
    character(1),
    USE.NAMES = FALSE
  ),
  "primprod_ecoregion_rescaled"
)

# Per cell, average all component metrics with equal weights and insert the
# (integer-rounded) result as the combined score.
dbExecute(
  con_sdm,
  glue(
    "
  WITH metric_values AS (
    SELECT 
      cm.cell_id,
      m.metric_key,
      cm.value
    FROM cell_metric cm
    JOIN metric m ON cm.metric_seq = m.metric_seq
    WHERE m.metric_key IN ({paste(shQuote(metric_keys), collapse=', ')}) )
  INSERT INTO cell_metric (cell_id, metric_seq, value)
  SELECT 
    cell_id,
    {m_seq} as metric_seq,
    ROUND(AVG(value)) AS value
  FROM metric_values
  GROUP BY cell_id"
  )
)
[1] 629771
Code
# Check the combined-score metric at the cell level.
d_ck <- tbl(con_sdm, "metric") |>
  select(metric_seq, metric_key) |>
  filter(metric_key == !!m_key) |>
  left_join(tbl(con_sdm, "cell_metric"), by = "metric_seq") |>
  collect()

summary(d_ck$value)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   0.00   17.00   23.00   24.66   29.00   95.00 
Code
# OLD:
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
# 0.00   10.00   16.00   21.28   31.00   89.00
# NEW:
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
#  0.00   12.00   17.00   21.29   26.00   89.00
# NEWEST:
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
#  0.0    12.0    18.0    22.1    28.0    89.0

4.12 Calculate Score per ProgramArea from ProgramArea metrics

Code
# metric: calculate score per program area
m_key <- "score_extriskspcat_primprod_ecoregionrescaled_equalweights"
# metric_seq of the combined score metric created in the previous section
m_seq <- tbl(con_sdm, "metric") |>
  filter(metric_key == !!m_key) |>
  pull(metric_seq)

# component metric keys feeding the score
# NOTE(review): unlike the cell-level version above, sp_cats is used here
# without str_replace(' ', '_') — confirm no category name contains a space.
metric_input_keys <- c(
  glue("extrisk_{sp_cats}_ecoregion_rescaled"),
  "primprod_ecoregion_rescaled"
)
m_input_seqs <- tbl(con_sdm, "metric") |>
  filter(metric_key %in% metric_input_keys) |>
  pull(metric_seq)
# every input key must resolve to exactly one existing metric_seq
stopifnot(length(metric_input_keys) == length(m_input_seqs))

# zone_seq values for every zone in the program-area table
z_seqs <- tbl(con_sdm, "zone") |>
  filter(tbl == !!tbl_pra) |>
  pull(zone_seq)

# DEBUG check
# tbl(con_sdm, "zone_metric") |>
#   filter(
#     metric_seq == !!m_seq,
#     zone_seq %in% z_seqs) |>
#   arrange(desc(value)) |>
#   pull(value) |>
#   range()
# 5.704735 61.050130
#   View()
#   nrow() -> n_existing_zm
# z_seq == 134 # (ALA)

# delete existing zone_metric values for this metric, restricted to the
# program-area zones, so re-runs do not accumulate duplicate rows
dbExecute(
  con_sdm,
  glue(
    "DELETE FROM zone_metric 
  WHERE metric_seq = {m_seq} 
    AND zone_seq IN ({paste(z_seqs, collapse = ',')})"
  )
)
[1] 0
Code
# Score per program area = simple average of the component zone metrics
# (pct_area weighting is already baked into each component value).
# AVG(value) replaces the equivalent but roundabout SUM(value)/COUNT(value);
# both ignore NULLs, and the WHERE clause excludes them anyway.
dbExecute(
  con_sdm,
  glue(
    "INSERT INTO zone_metric (zone_seq, metric_seq, value)
  SELECT
    zone_seq,
    {m_seq} AS metric_seq,
    AVG(value) AS value
  FROM zone_metric
  WHERE
    metric_seq IN ({paste(m_input_seqs, collapse = ',')})
    AND zone_seq IN ({paste(z_seqs, collapse = ',')})
    AND value IS NOT NULL
  GROUP BY zone_seq"
  )
) # 20 rows: one per program area
[1] 20
Code
# check the new metric: join score rows to program-area zones and summarize
d_ck <- tbl(con_sdm, "metric") |>
  select(metric_seq, metric_key) |>
  filter(
    metric_key == !!m_key
  ) |>
  left_join(
    tbl(con_sdm, "zone_metric"),
    by = "metric_seq"
  ) |>
  left_join(
    tbl(con_sdm, "zone") |>
      filter(
        tbl == !!tbl_pra
      ) |>
      select(zone_seq, pa_key = value),
    by = "zone_seq"
  ) |>
  # keep only rows matched to a program-area zone
  filter(!is.na(pa_key)) |>
  collect()

d_ck |>
  pull(value) |>
  summary()
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   7.52   21.58   27.80   27.77   32.34   52.66 
Code
# PlanAreas:
#  Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
#  7.16   17.10   22.86   25.42   31.59   60.07
# NEW ProgramAreas:
#  Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
# 7.642  17.372  24.131  26.439  32.972  60.068

4.13 Metric labels

Add display label columns to metric table as single source of truth for display names used in score tables, Word/Excel output, and flower plots.

Code
# add display label columns to metric table if missing
# (hoisted: query information_schema once, rather than once per column)
cols <- dbGetQuery(
  con_sdm,
  "SELECT column_name FROM information_schema.columns WHERE table_name = 'metric'"
)
for (col in c("metric_title", "metric_abbrev")) {
  if (!col %in% cols$column_name) {
    dbExecute(con_sdm, glue("ALTER TABLE metric ADD COLUMN {col} VARCHAR"))
  }
}

# display labels for the score-contributing metrics; same rows as before,
# built column-wise with tibble() instead of row-wise with tribble()
metric_labels <- tibble(
  metric_key = c(
    "score_extriskspcat_primprod_ecoregionrescaled_equalweights",
    "extrisk_bird_ecoregion_rescaled",
    "extrisk_coral_ecoregion_rescaled",
    "extrisk_fish_ecoregion_rescaled",
    "extrisk_invertebrate_ecoregion_rescaled",
    "extrisk_mammal_ecoregion_rescaled",
    "extrisk_other_ecoregion_rescaled",
    "extrisk_turtle_ecoregion_rescaled",
    "primprod_ecoregion_rescaled"
  ),
  metric_title = c(
    "Score", "Bird", "Coral", "Fish", "Invertebrate",
    "Mammal", "Other", "Turtle", "Primary Production"
  ),
  metric_abbrev = c(
    "Score", "Bird", "Coral", "Fish", "Invertebrate",
    "Mammal", "Other", "Turtle", "Prim. Prod."
  )
)

# expose the labels data frame to DuckDB as a temporary view, then copy the
# titles/abbreviations into metric by metric_key (returns 9 rows updated)
duckdb_register(con_sdm, "tmp_labels", metric_labels)
dbExecute(
  con_sdm,
  "
  UPDATE metric
  SET metric_title  = t.metric_title,
      metric_abbrev = t.metric_abbrev
  FROM tmp_labels t
  WHERE metric.metric_key = t.metric_key"
)
[1] 9
Code
# drop the temporary registration now that labels are persisted in metric
duckdb_unregister(con_sdm, "tmp_labels")

5 Add/update zone_taxon

For showing species composition per zone (subregion, program area) in mapgl app.

Code
# lazy reference to vetted taxa (is_ok) with the display and status-flag
# columns used below; keyed by mdl_seq for joining to model_cell
tbl_taxon <- tbl(con_sdm, "taxon") |>
  filter(is_ok) |>
  select(
    sp_cat,
    sp_common = common_name,
    sp_scientific = scientific_name,
    taxon_id,
    taxon_authority,
    rl_code = extrisk_code,
    er_score,
    is_er_spatial,
    is_mmpa,
    is_mbta,
    is_bcc,
    esa_code,
    esa_source,
    mdl_seq
  )

# Per zone (subregion + program area) × model/taxon: total overlapping cell
# area and mean habitat suitability. Joins: zone -> zone_cell -> model_cell
# (+ taxon attributes) -> cell (for area_km2), all pushed down to DuckDB,
# then collected.
d <- tbl(con_sdm, "zone") |>
  filter(
    tbl %in% c(tbl_sr, tbl_pra)
  ) |>
  select(zone_seq, zone_tbl = tbl, zone_fld = fld, zone_value = value) |> # zone_tbl = tbl,
  inner_join(
    tbl(con_sdm, "zone_cell") |>
      select(zone_seq, cell_id),
    by = join_by(zone_seq)
  ) |>
  inner_join(
    tbl(con_sdm, "model_cell") |>
      select(mdl_seq, cell_id, value) |>
      inner_join(
        tbl_taxon,
        by = join_by(mdl_seq)
      ),
    by = join_by(cell_id)
  ) |>
  inner_join(
    tbl(con_sdm, "cell") |>
      select(cell_id, area_km2),
    by = join_by(cell_id)
  ) |>
  # group by every taxon attribute so they pass through the summarize
  group_by(
    zone_tbl,
    zone_fld,
    zone_value, # zone_seq,
    mdl_seq,
    sp_cat,
    sp_common,
    sp_scientific,
    taxon_id,
    taxon_authority,
    rl_code,
    er_score,
    is_er_spatial,
    is_mmpa,
    is_mbta,
    is_bcc,
    esa_code,
    esa_source
  ) |>
  summarize(
    area_km2 = sum(area_km2, na.rm = T),
    # model_cell.value is suitability in 0-100; rescale to 0-1
    avg_suit = mean(value, na.rm = T) / 100,
    .groups = "drop"
  ) |>
  collect()


# Weight suitability by extinction risk and area, then compute each taxon's
# share of its category total within a zone.
d <- d |>
  mutate(
    # for is_er_spatial (turtles), ER is already baked into cell values
    suit_rl = ifelse(is_er_spatial, avg_suit, avg_suit * er_score / 100),
    suit_rl_area = suit_rl * area_km2
  ) |>
  mutate(
    cat_suit_rl_area = sum(suit_rl_area, na.rm = TRUE),
    .by = c(zone_fld, zone_value, sp_cat)
  ) |>
  mutate(pct_cat = suit_rl_area / cat_suit_rl_area)

# persist zone × taxon composition for the mapgl app: CSV and DuckDB table.
# (dropped a pointless glue() around the constant table name; T -> TRUE)
write_csv(d, zone_taxon_csv)
dbWriteTable(con_sdm, "zone_taxon", d, overwrite = TRUE)

6 Update PostGIS layers with metrics from DuckDB

These layers are used in the mapgl app (via pg_tileserv) and need to be updated with the new metrics calculated in DuckDB. The update_table_with_clear function adds new metric columns if they don’t exist, clears existing values for those metrics, and then updates with new values from DuckDB.

6.1 Update Zone Tables in PostGIS with Metrics

Code
# Update a PostGIS zone table in place with metric columns from a data frame.
#
# Adds any metric columns missing from `tbl`, clears existing values for the
# columns being updated (so stale metrics don't linger for zones absent from
# `df`), then updates rows matched on `key_col` via a temporary staging table.
#
# @param con     DBI connection to PostGIS
# @param df      data frame with `key_col` plus one column per metric
# @param tbl     name of an existing PostGIS table to update
# @param key_col join key present in both `df` and `tbl`
# @return number of rows updated (also reported via message())
update_table_with_clear <- function(con, df, tbl, key_col = "programarea_key") {
  stopifnot(dbExistsTable(con, tbl))
  existing_cols <- dbListFields(con, tbl)
  stopifnot(key_col %in% existing_cols)
  stopifnot(key_col %in% names(df))
  source_cols <- setdiff(names(df), key_col)

  # Add missing columns.
  # NOTE(review): identifiers are interpolated unquoted — metric keys are
  # trusted internal names here, not user input.
  for (col in source_cols) {
    if (!col %in% existing_cols) {
      add_col_query <- glue(
        "ALTER TABLE {tbl} ADD COLUMN {col} DOUBLE PRECISION"
      )
      dbExecute(con, add_col_query)
      message("Added column: ", col)
    }
  }

  # Clear existing values for pre-existing columns only
  # (columns just added above are already all NULL)
  existing_source_cols <- intersect(source_cols, existing_cols)
  if (length(existing_source_cols) > 0) {
    clear_clauses <- paste0(existing_source_cols, " = NULL", collapse = ", ")
    clear_query <- glue("UPDATE {tbl} SET {clear_clauses}")
    dbExecute(con, clear_query)
    message(
      "Cleared existing values for: ",
      paste(existing_source_cols, collapse = ", "),
      "\n"
    )
  }

  # Stage new values in a temp table; on.exit guarantees cleanup even if the
  # UPDATE below errors (previously the temp table leaked on failure).
  dbWriteTable(
    con,
    "temp_metrics_update",
    df,
    temporary = TRUE,
    overwrite = TRUE
  )
  on.exit(
    try(dbRemoveTable(con, "temp_metrics_update", force = TRUE), silent = TRUE),
    add = TRUE
  )

  set_clauses <- paste0(source_cols, " = temp.", source_cols, collapse = ", ")
  update_query <- glue(
    "
    UPDATE {tbl} 
    SET {set_clauses}
    FROM temp_metrics_update temp
    WHERE {tbl}.{key_col} = temp.{key_col}
  "
  )

  rows_affected <- dbExecute(con, update_query)
  message("Updated ", rows_affected, " rows")

  rows_affected
}

# report zone.tbl entries lacking a version suffix ----
# (the actual UPDATE rename is intentionally disabled; this only logs)
zone_tbls <- tbl(con_sdm, "zone") |>
  distinct(tbl) |>
  pull(tbl)
for (zt in zone_tbls) {
  has_sfx <- grepl("_v\\d+$", zt)
  zt_v <- if (has_sfx) zt else paste0(zt, v_sfx)
  if (zt_v != zt) {
    # UPDATE zone SET tbl = '{zt_v}' WHERE tbl = '{zt}' -- left disabled
    cat(glue("zone.tbl: {zt} -> {zt_v} ~ {Sys.time()}"))
  }
}
Code
# create versioned PostGIS tables from existing tables ----
# copies e.g. base table -> base_v3 so metrics can be written to versioned
# tables without disturbing prior versions served by pg_tileserv
pg_zone_tbls <- c(
  tbl_er,
  tbl_pra
)
for (pg_tbl in pg_zone_tbls) {
  # pg_tbl already includes v_sfx (e.g. _v3) from tbl_er, tbl_pra
  pg_tbl_base <- str_remove(pg_tbl, v_sfx)
  if (!dbExistsTable(con, pg_tbl)) {
    dbExecute(con, glue("CREATE TABLE {pg_tbl} AS SELECT * FROM {pg_tbl_base}"))
    message(glue("Created {pg_tbl} from {pg_tbl_base}"))
  } else {
    message(glue("{pg_tbl} already exists"))
  }
}

6.2 Load Program Area Scores to PostGIS

Code
# delete zone_metric rows without a metric_key
# (orphans left behind by earlier metric deletions)
dbExecute(
  con_sdm,
  "
  DELETE FROM zone_metric 
  WHERE metric_seq NOT IN (SELECT metric_seq FROM metric)"
) # 0

# check if table exists in PostGIS; it is created by the chunk above
if (!dbExistsTable(con, tbl_pra)) {
  stop("Run create_pg_v3_tables R chunk above")
}

# get program area metrics from DuckDB: one row per program area, one column
# per metric_key, values rounded to 1 decimal before pivoting wide
d_pra_metrics <- tbl(con_sdm, "zone") |>
  filter(tbl == !!tbl_pra) |>
  select(programarea_key = value, zone_seq) |>
  left_join(
    tbl(con_sdm, "zone_metric") |>
      mutate(value = round(value, 1)),
    by = "zone_seq"
  ) |>
  left_join(
    tbl(con_sdm, "metric") |>
      select(metric_seq, metric_key),
    by = "metric_seq"
  ) |>
  select(programarea_key, metric_key, value) |>
  filter(
    !is.na(programarea_key),
    !is.na(metric_key),
    !is.na(value)
  ) |>
  pivot_wider(
    names_from = metric_key,
    values_from = value
  ) |>
  collect()
message(glue("Loaded {nrow(d_pra_metrics)} program area metrics"))

# For complete refresh (clears existing values first); pushes the wide
# metrics table into the versioned PostGIS program-area layer
update_table_with_clear(
  con,
  d_pra_metrics,
  tbl_pra,
  "programarea_key"
)
Code
# Drop stale metric columns from the PostGIS program-area table: columns
# present in the DB but no longer produced, limited to extrisk_* plus a
# literal "na" column (artifact of an earlier bad pivot).
# Fixes: typo `fdls_pra_rm` -> `flds_pra_rm`; loop locals renamed so they no
# longer shadow dplyr::tbl() (and leak `tbl`/`fld` into the global env);
# removed the unused `a <-` assignment.
flds_pra_df <- names(d_pra_metrics)
flds_pra_db <- dbListFields(con, tbl_pra)
flds_pra_rm <- setdiff(flds_pra_db, flds_pra_df) |>
  str_subset("(extrisk_)|(^na$)")

d_rm <-
  tibble(
    tbl = tbl_pra,
    fld = flds_pra_rm
  )

for (i in seq_len(nrow(d_rm))) {
  tbl_i <- d_rm$tbl[i]
  fld_i <- d_rm$fld[i]

  dbExecute(con, glue('ALTER TABLE {tbl_i} DROP COLUMN "{fld_i}"'))
  message("Removed column: ", fld_i, " from ", tbl_i)
}

6.3 Add index for vector tiles

per the “Feature id” section of the pg_tileserv documentation

Code
# add integer primary key to postgres programareas table
# (pg_tileserv uses an integer column as the vector-tile feature id)
dbExecute(
  con,
  glue(
    "ALTER TABLE public.{tbl_pra} ALTER COLUMN programarea_id TYPE INTEGER;"
  )
)
# ecoregions table: add a serial id if not already present
dbExecute(
  con,
  glue(
    "ALTER TABLE public.{tbl_er} ADD COLUMN IF NOT EXISTS ecoregion_id SERIAL PRIMARY KEY;"
  )
)

7 Generate downloads

For sharing with BOEM and external partners.

Code
# toggle: regenerate all download artifacts from scratch
redo_dl <- T

# ecoregion gpkg destination (input copy, no metrics — just copy to dir_v)
er_gpkg_dst <- glue("{dir_v}/ply_ecoregions_2025.gpkg")

# redo_dl: delete existing files so the file-exists guards below rebuild each
if (redo_dl) {
  for (f in c(
    pra_gpkg,
    lyrs_csv,
    metrics_tif,
    metrics_pra_tif,
    er_gpkg_dst
  )) {
    if (file_exists(f)) {
      file_delete(f)
    }
  }
}

# * pra_gpkg (with metrics from DuckDB) ----
# Build the downloadable program-areas geopackage: prior-version geometry
# (base attribute columns only) joined to freshly computed wide metrics.
if (!file.exists(pra_gpkg)) {
  message("Generating Program Areas geopackage with metrics...")

  # get metrics from DuckDB (wide: one column per metric_key)
  # NOTE(review): unlike §6.2, no !is.na(programarea_key) filter here —
  # confirm NULL-keyed zones cannot reach this query.
  d_pra_metrics <- tbl(con_sdm, "zone") |>
    filter(tbl == !!tbl_pra) |>
    select(programarea_key = value, zone_seq) |>
    left_join(
      tbl(con_sdm, "zone_metric") |>
        mutate(value = round(value, 1)),
      by = "zone_seq"
    ) |>
    left_join(
      tbl(con_sdm, "metric") |>
        select(metric_seq, metric_key),
      by = "metric_seq"
    ) |>
    select(programarea_key, metric_key, value) |>
    filter(!is.na(metric_key), !is.na(value)) |>
    pivot_wider(names_from = metric_key, values_from = value) |>
    collect()

  # read base geometry (spatial columns only, no metrics)
  base_cols <- c(
    "programarea_id",
    "region_key",
    "region_name",
    "planarea_key",
    "planarea_name",
    "programarea_key",
    "programarea_name",
    "ctr_lon",
    "ctr_lat",
    "area_km2"
  )
  pra_prev_gpkg <- glue(
    "{dir_derived}/{ver_prev}/ply_programareas_2026_{ver_prev}.gpkg"
  )
  pra_base <- read_sf(pra_prev_gpkg) |>
    select(any_of(base_cols))

  # join metrics to geometry
  pra_out <- pra_base |>
    left_join(d_pra_metrics, by = "programarea_key")

  st_write(pra_out, pra_gpkg, delete_dsn = T, quiet = T)
}
Generating Program Areas geopackage with metrics...
Code
# * lyrs_csv ----
# Layer catalog for the metrics rasters: display order, category, human
# label (layer), and raster/metric key (lyr).
# Fixes: dropped pointless glue() around constant strings; 5th tibble's
# argument order aligned with its siblings (bind_rows matches by name, so
# output is identical either way).
if (!file.exists(lyrs_csv)) {
  message("Generating layers CSV...")

  d_lyrs <- bind_rows(
    tibble(
      order = 1,
      category = "Overall",
      layer = "score",
      lyr = "score_extriskspcat_primprod_ecoregionrescaled_equalweights"
    ),
    tibble(
      order = 2,
      category = "Species, rescaled by Ecoregion",
      layer = glue("{sp_cats}: ext. risk, ecorgn"),
      lyr = glue("extrisk_{sp_cats}_ecoregion_rescaled")
    ),
    tibble(
      order = 3,
      category = "Primary Productivity, rescaled by Ecoregion",
      layer = "prim prod, ecorgn",
      lyr = "primprod_ecoregion_rescaled"
    ),
    tibble(
      order = 4,
      category = "Species, raw Extinction Risk",
      layer = glue("{sp_cats}: ext. risk"),
      lyr = glue("extrisk_{sp_cats}")
    ),
    tibble(
      order = 5,
      category = "Primary Productivity, raw Phytoplankton",
      layer = "prim prod, 2014-2023 avg (mg C/m^2/day)",
      lyr = "primprod"
    )
  )

  write_csv(d_lyrs, lyrs_csv)
}
Generating layers CSV...
Code
# * metrics_tif ----
# Build the multi-layer metrics raster: two categorical zone layers
# (ecoregion_key, programarea_key) plus one layer per catalog entry,
# plus log(primprod).
if (!file.exists(metrics_tif)) {
  message("Generating Metrics raster...")

  d_lyrs <- read_csv(lyrs_csv)

  # add zones: one categorical raster per zone field
  lst <- list()
  for (zone_fld in c("ecoregion_key", "programarea_key")) {
    # zone_fld = "ecoregion_key"
    # zone_fld = "programarea_key"

    # for cells covered by multiple zones, keep the zone with the greatest
    # pct_covered (window_order + last)
    d <- tbl(con_sdm, "zone") |>
      filter(fld == !!zone_fld) |>
      left_join(
        tbl(con_sdm, "zone_cell"),
        by = "zone_seq"
      ) |>
      group_by(cell_id) |>
      window_order(pct_covered) |>
      summarize(
        value = last(value),
        .groups = "drop"
      ) |>
      collect() |>
      mutate(
        "{zone_fld}" := as.factor(value)
      ) |>
      select(all_of(c("cell_id", zone_fld)))

    lvls <- levels(d[[zone_fld]])
    if (length(lvls) == 0) {
      message(glue("Skipping {zone_fld}: no zone data found"))
      next
    }

    # burn factor codes into an NA template raster, then attach the levels
    r <- init(r_cell[[1]], NA) # |> as.factor() #sort(unique(d$value)))
    # r$chr <- NA_character_ # ensure chr type
    r[d$cell_id] <- d[[zone_fld]]
    varnames(r) <- zone_fld
    names(r) <- zone_fld
    levels(r) <- tibble(
      id = seq_along(lvls),
      "{zone_fld}" := lvls
    )
    lst[[zone_fld]] <- r
  }
  r_zones <- do.call(c, lst |> unname())

  # add layers: one raster per catalog row, fetched via get_rast()
  lst <- list()
  for (i in 1:nrow(d_lyrs)) {
    # i = 2
    lyr <- d_lyrs$lyr[i]
    layer <- d_lyrs$layer[i]

    r <- get_rast(lyr) # plot(r)
    names(r) <- lyr
    varnames(r) <- layer
    lst[[i]] <- r
  }
  r_metrics <- do.call(c, lst)

  # add log of primprod
  r_metrics[["primprod_log"]] <- log(r_metrics[["primprod"]])

  # show layer names
  names(r_metrics) |> sort() |> print()

  # zones first, then metric layers, written as one multi-band GeoTIFF
  r_metrics <- c(r_zones, r_metrics)
  writeRaster(r_metrics, metrics_tif, overwrite = T)
}
Generating Metrics raster...
 [1] "extrisk_bird"                                              
 [2] "extrisk_bird_ecoregion_rescaled"                           
 [3] "extrisk_coral"                                             
 [4] "extrisk_coral_ecoregion_rescaled"                          
 [5] "extrisk_fish"                                              
 [6] "extrisk_fish_ecoregion_rescaled"                           
 [7] "extrisk_invertebrate"                                      
 [8] "extrisk_invertebrate_ecoregion_rescaled"                   
 [9] "extrisk_mammal"                                            
[10] "extrisk_mammal_ecoregion_rescaled"                         
[11] "extrisk_other"                                             
[12] "extrisk_other_ecoregion_rescaled"                          
[13] "extrisk_turtle"                                            
[14] "extrisk_turtle_ecoregion_rescaled"                         
[15] "primprod"                                                  
[16] "primprod_ecoregion_rescaled"                               
[17] "primprod_log"                                              
[18] "score_extriskspcat_primprod_ecoregionrescaled_equalweights"
Code
# * metrics_pra_tif ----
# Mask the metrics raster to the program-area footprint and trim the extent.
# The programarea_key layer is non-NA only inside program areas, so masking
# by it clips every layer to the program-area study area.
# Fix: removed dead code — `sr` and `pra_sr` (read from sr_pra_csv) were
# computed but never used; T -> TRUE.
if (!file.exists(metrics_pra_tif)) {
  message("Generating Metrics raster masked to Program Areas...")

  r_metrics <- rast(metrics_tif)
  r_mask <- subset(r_metrics, "programarea_key")

  r_metrics_pra <- r_metrics |>
    mask(r_mask) |>
    trim()

  writeRaster(r_metrics_pra, metrics_pra_tif, overwrite = TRUE)
}
Generating Metrics raster masked to Program Areas...

|---------|---------|---------|---------|
=========================================
                                          
Code
# * er_gpkg_dst (input, no metrics — copy to dir_v for downloads) ----
# skip when source and destination are the same path or the copy exists
if (er_raw_gpkg != er_gpkg_dst && !file.exists(er_gpkg_dst)) {
  message("Copying ecoregion gpkg to dir_v...")
  file_copy(er_raw_gpkg, er_gpkg_dst)
}
Copying ecoregion gpkg to dir_v...
Code
# output: BOEM OCS planning-area polygons joined to sensitivity metrics
pa_metrics_geo <- glue(
  "{dir_derived}/OCS_Proposed_Planning_Area_Polygon_Env_Sensitivity_Metrics.geojson"
)

# * pa_metrics_geo ----
# Join metrics (attached to pa_gpkg attributes) onto BOEM's official OCS
# planning-area polygons, reconciling the Gulf key mismatch (CG/EG/WG vs
# CGA/EGA/WGA).
if (!file.exists(pa_metrics_geo)) {
  pa_boem_geo <- glue("{dir_data}/raw/boem.gov/OCS_Area_Polygons.geojson")

  pa <- read_sf(pa_gpkg)

  pa_boem <- read_sf(pa_boem_geo)

  # BOEM planning areas lacking a direct planarea_key match (Gulf keys)
  (pa_miss <- pa_boem |>
    st_drop_geometry() |>
    anti_join(
      pa |>
        st_drop_geometry() |>
        select(planarea_key, planarea_name),
      by = join_by(PLANNING_AREA == planarea_key)
    ) |>
    pull(PLANNING_AREA))
  # [1] "CG" "EG" "WG"

  # exploratory: result is discarded inside this if-block (not auto-printed)
  pa_boem |>
    filter(PLANNING_AREA %in% pa_miss) |>
    mapview::mapView(zcol = "PLANNING_AREA")

  # exploratory: confirm the Gulf keys on our side (also discarded)
  pa |>
    st_drop_geometry() |>
    filter(
      region_name == "Gulf of America"
    ) |>
    select(planarea_key, planarea_name)
  #   planarea_key planarea_name
  #   <chr>        <chr>
  # 1 CGA          Central Gulf of America
  # 2 EGA          Eastern Gulf of America
  # 3 WGA          Western Gulf of America

  # map CGA/EGA/WGA -> CG/EG/WG so the attribute join matches BOEM keys
  pa_metrics <- pa_boem |>
    left_join(
      pa |>
        st_drop_geometry() |>
        mutate(
          PLANNING_AREA = case_match(
            planarea_key,
            "CGA" ~ "CG",
            "EGA" ~ "EG",
            "WGA" ~ "WG",
            .default = planarea_key
          )
        ),
      by = join_by(PLANNING_AREA)
    )

  write_sf(pa_metrics, pa_metrics_geo)
}

8 Generate PMTiles

Static PMTiles archives for mapgl/mapsp apps, replacing PostGIS/pg_tileserv tile pipeline.

Code
# check / install tippecanoe and pmtiles CLI ----
# Ensure a CLI binary is on PATH, installing it via homebrew when absent;
# errors if neither the binary nor homebrew is available.
check_brew_bin <- function(bin) {
  found <- Sys.which(bin) != ""
  if (!found) {
    brew_ok <- Sys.which("brew") != ""
    if (!brew_ok) {
      stop(glue("{bin} not found and homebrew not available"))
    }
    message(glue("installing {bin} via homebrew..."))
    system2("brew", c("install", bin))
    # verify the install actually put the binary on PATH
    stopifnot(Sys.which(bin) != "")
  }
  message(glue("{bin}: {Sys.which(bin)}"))
}
# both CLIs are required for the PMTiles pipeline below
check_brew_bin("tippecanoe") # tippecanoe: /opt/homebrew/bin/tippecanoe
check_brew_bin("pmtiles") # pmtiles:       /opt/homebrew/bin/pmtiles

# ensure output directory for .pmtiles archives exists
dir_create(dir_pmtiles)

# helper: sf → pmtiles via tippecanoe ----
# Write an sf object to a temporary GeoJSON, then run tippecanoe to produce
# {out_dir}/{layer_name}.pmtiles at full detail (zooms 0-12, no feature or
# tile-size limits). Returns the output path; stops if tippecanoe produced
# no file. The temp GeoJSON is removed via on.exit.
sf_to_pmtiles <- function(sf_obj, layer_name, out_dir = dir_pmtiles) {
  tmp_geojson <- tempfile(fileext = ".geojson")
  on.exit(unlink(tmp_geojson), add = TRUE)

  out_pmtiles <- glue("{out_dir}/{layer_name}.pmtiles")

  # wrap polygons crossing the antimeridian so GeoJSON [-180,180] doesn't clip
  #sf_obj <- st_wrap_dateline(sf_obj, options = c("WRAPDATELINE=YES"))

  st_write(sf_obj, tmp_geojson, driver = "GeoJSON") #, delete_dsn = TRUE)

  # tippecanoe: full detail for small polygon layers
  # NOTE(review): system2() exit status is not checked; failure is only
  # caught indirectly by the file_exists() stopifnot below.
  system2(
    "tippecanoe",
    c(
      "-z12",
      "-Z0",
      "--no-feature-limit",
      "--no-tile-size-limit",
      "-l",
      layer_name,
      "-o",
      out_pmtiles,
      "--force",
      tmp_geojson
    )
  )
  stopifnot(file_exists(out_pmtiles))
  message(glue("wrote {out_pmtiles} ({file_size(out_pmtiles)})"))
  out_pmtiles
}

# program areas WITH metrics ----
# NOTE(review): this wide query duplicates the one in §6.2 / §7 — consider
# factoring into a shared helper.
d_pra_metrics <- tbl(con_sdm, "zone") |>
  filter(tbl == !!tbl_pra) |>
  select(programarea_key = value, zone_seq) |>
  left_join(
    tbl(con_sdm, "zone_metric") |>
      mutate(value = round(value, 1)),
    by = "zone_seq"
  ) |>
  left_join(
    tbl(con_sdm, "metric") |>
      select(metric_seq, metric_key),
    by = "metric_seq"
  ) |>
  select(programarea_key, metric_key, value) |>
  filter(!is.na(programarea_key), !is.na(metric_key), !is.na(value)) |>
  pivot_wider(names_from = metric_key, values_from = value) |>
  collect()

# load geometry from memory or prior version (pra_gpkg may not exist yet)
if (exists("pra")) {
  pra_geom <- pra
} else if (file_exists(pra_gpkg)) {
  pra_geom <- read_sf(pra_gpkg)
} else {
  pra_prev <- glue(
    "{dir_derived}/{ver_prev}/ply_programareas_2026_{ver_prev}.gpkg"
  )
  pra_geom <- read_sf(pra_prev)
}

# strip any existing metric columns before joining fresh ones
# NOTE(review): base_cols duplicates the vector defined in §7 — consider a
# single shared definition.
base_cols <- c(
  "programarea_id",
  "region_key",
  "region_name",
  "planarea_key",
  "planarea_name",
  "programarea_key",
  "programarea_name",
  "ctr_lon",
  "ctr_lat",
  "area_km2"
)
# pra_sf (geometry + metrics) is kept for interactive inspection below
pra_sf <- pra_geom |>
  select(any_of(base_cols)) |>
  left_join(d_pra_metrics, by = "programarea_key")

# mapView(pra_sf, zcol = "score_extriskspcat_primprod_ecoregionrescaled_equalweights")

# program areas for pmtiles: base columns only (scores joined at render time)
pra_pm_sf <- pra_geom |> select(any_of(base_cols))
sf_to_pmtiles(pra_pm_sf, tbl_pra_pm)

# plot(pra_sf["score_extriskspcat_primprod_ecoregionrescaled_equalweights"])

# ecoregions WITHOUT metrics (outlines only) ----
er_sf <- read_sf(er_raw_gpkg)
sf_to_pmtiles(er_sf, tbl_er_pm)

9 Summary data products

Merged from msens-summary_programareas.qmd: ecoregion × program area intersections, label placement points, and score tables.

Code
# delete existing summary data files if redo (except lbls_csv — manually adjusted)
if (redo_summary_data) {
  for (f in c(er_pra_gpkg, er_pra_csv, er_pra_n_csv, pra_n_csv)) {
    if (file_exists(f)) {
      file_delete(f)
    }
  }
}

9.1 Ecoregion × Program Area intersections

Code
# inputs: ecoregions (raw) and program areas (with metrics, from §7)
er <- read_sf(er_raw_gpkg)
pra <- read_sf(pra_gpkg)

# Intersect ecoregions × program areas; keep slivers > 1 km2; compute each
# piece's share of its program area (NA when a program area lies entirely
# within one ecoregion, so 100% rows render blank downstream).
if (!file.exists(er_pra_gpkg)) {
  er_pra <- st_intersection(
    er |>
      select(ecoregion_key, ecoregion_name),
    pra |>
      select(region_key, region_name, programarea_key, programarea_name)
  ) |>
    st_make_valid() |>
    mutate(
      area_km2 = st_area(geom) |> set_units(km^2) |> drop_units(),
      ctr_lon = st_coordinates(st_centroid(geom))[, 1],
      ctr_lat = st_coordinates(st_centroid(geom))[, 2]
    ) |>
    filter(area_km2 > 1) |>
    arrange(programarea_key, ecoregion_key) |>
    group_by(programarea_key) |>
    mutate(
      pct_programarea_in_ecoregion = round(
        area_km2 / sum(area_km2) * 100,
        2
      ),
      # 100% (single-ecoregion program area) becomes NA for display
      pct_programarea_in_ecoregion = na_if(
        pct_programarea_in_ecoregion,
        100.0
      )
    ) |>
    ungroup()

  # final column order and rounding before writing
  er_pra <- er_pra |>
    select(
      region_key,
      region_name,
      ecoregion_key,
      ecoregion_name,
      programarea_key,
      programarea_name,
      pct_programarea_in_ecoregion,
      area_km2,
      ctr_lon,
      ctr_lat
    ) |>
    arrange(region_key, ecoregion_key, programarea_key) |>
    mutate(
      area_km2 = round(area_km2, 2),
      ctr_lon = round(ctr_lon, 4),
      ctr_lat = round(ctr_lat, 4)
    )

  er_pra |>
    st_drop_geometry() |>
    write_csv(er_pra_csv, na = "")

  write_sf(er_pra, er_pra_gpkg, delete_dsn = T)
}
Warning: attribute variables are assumed to be spatially constant throughout
all geometries
Code
# reload intersections (whether just written above or from a prior run)
er_pra <- read_sf(er_pra_gpkg)

9.2 Label placement points

Code
# point layers at precomputed centroids for map-label placement
er_pts <- er |>
  st_drop_geometry() |>
  select(
    ecoregion_key,
    ecoregion_name,
    region_name,
    region_key,
    ctr_lon,
    ctr_lat
  ) |>
  st_as_sf(coords = c("ctr_lon", "ctr_lat"), crs = 4326)
pra_pts <- pra |>
  st_drop_geometry() |>
  select(
    programarea_key,
    programarea_name,
    region_name,
    region_key,
    ctr_lon,
    ctr_lat
  ) |>
  st_as_sf(coords = c("ctr_lon", "ctr_lat"), crs = 4326)

# keep file-exists guard — label placement is manually adjusted
# (seed defaults only on first run; later edits to lbls_csv are preserved)
if (!file.exists(lbls_csv)) {
  bind_rows(
    er_pts |> st_drop_geometry() |> select(ecoregion_key),
    pra_pts |> st_drop_geometry() |> select(programarea_key)
  ) |>
    arrange(ecoregion_key, programarea_key) |>
    mutate(
      text_anchor = ifelse(!is.na(programarea_key), "center", "top"),
      text_justify = "center",
      text_offset_right = 0,
      text_offset_down = 0
    ) |>
    write_csv(lbls_csv, na = "")
}

9.3 Score tables

Code
# program-area attributes (metrics) without geometry
d_pra <- read_sf(pra_gpkg) |>
  st_drop_geometry()

# get field-to-label mapping from metric table (single source of truth)
d_metric_labels <- tbl(con_sdm, "metric") |>
  filter(!is.na(metric_title)) |>
  select(metric_key, metric_title, metric_abbrev) |>
  collect()

# fields present in program area data that have labels
flds <- intersect(names(d_pra), d_metric_labels$metric_key)

# named vector: metric_key -> short name for column renaming
flds_rn <- d_metric_labels |>
  filter(metric_key %in% flds) |>
  mutate(
    short = metric_key |>
      str_replace(
        "score_extriskspcat_primprod_ecoregionrescaled_equalweights",
        "score"
      ) |>
      str_replace("_ecoregion_rescaled", "") |>
      str_replace("extrisk_", "")
  ) |>
  pull(short, name = metric_key)

# ecoregion header rows with NA for all metric columns
na_cols <- setNames(
  rep(list(NA_real_), length(flds_rn) + 1),
  c("area_km2", unname(flds_rn))
)
d_er_n <- read_csv(er_pra_csv) |>
  distinct(
    region_key,
    region_name,
    ecoregion_key,
    ecoregion_name
  ) |>
  bind_cols(as_tibble(na_cols))

# program-area rows: intersection shares joined to renamed metric columns
d_pra_n <- read_csv(er_pra_csv) |>
  select(
    region_name,
    ecoregion_name,
    programarea_key,
    programarea_name,
    pct_programarea_in_ecoregion
  ) |>
  left_join(
    d_pra |>
      select(all_of(c("programarea_key", "area_km2", flds))) |>
      rename_with(~ flds_rn[.x], all_of(flds)),
    by = "programarea_key"
  )

# stack header + detail rows; row_lbl is the display string used by tables
# ("ecoregion" sorts before "programarea", so headers lead each group)
d_er_pra_n <- bind_rows(
  d_er_n |> mutate(row_type = "ecoregion"),
  d_pra_n |> mutate(row_type = "programarea")
) |>
  relocate(
    row_type,
    programarea_key,
    programarea_name,
    pct_programarea_in_ecoregion,
    .after = ecoregion_name
  ) |>
  arrange(region_name, ecoregion_name, row_type, programarea_name) |>
  mutate(
    pct = ifelse(
      !is.na(pct_programarea_in_ecoregion),
      glue(" [{round(pct_programarea_in_ecoregion, 1)}%]", .trim = F),
      ""
    ),
    row_lbl = case_match(
      row_type,
      "ecoregion" ~ glue("{region_name}: {ecoregion_name} ({ecoregion_key})"),
      "programarea" ~ glue("· {programarea_name} ({programarea_key}){pct}")
    )
  ) |>
  relocate(row_lbl)
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `row_lbl = case_match(...)`.
Caused by warning:
! `case_match()` was deprecated in dplyr 1.2.0.
ℹ Please use `recode_values()` instead.
Code
# write the combined ecoregion / program-area score table with an explicit
# column order (Excel-friendly CSV with BOM)
d_er_pra_n |>
  select(
    row_type,
    region_name,
    ecoregion_name,
    ecoregion_key,
    programarea_name,
    programarea_key,
    pct_programarea_in_ecoregion,
    row_lbl,
    score,
    primprod,
    any_of(sp_cats),
    area_km2
  ) |>
  write_excel_csv(er_pra_n_csv)

10 Summary figures and tables

Merged from msens-summary_programareas.qmd: maps, flower plots, primary productivity charts, and Word/Excel tables.

Code
# ensure the figures directory exists; wipe prior figures on redo
dir_create(dir_figs)

if (redo_summary_figs) {
  existing_figs <- dir_ls(dir_figs)
  if (length(existing_figs) > 0) {
    file_delete(existing_figs)
  }
}

10.1 Program Area boundary maps

Code
# score column driving the choropleth fill
var_pra <- "score_extriskspcat_primprod_ecoregionrescaled_equalweights"
# manually adjusted label placements (see §9.2)
d_lbls <- read_csv(lbls_csv)

# Render two static maps: Alaska (longitudes shifted across the
# antimeridian) and lower-48 (bbox-filtered), each as an html widget then
# screenshotted to PNG.
for (is_ak in c(TRUE, FALSE)) {
  if (is_ak) {
    # AK: shift longitudes so geometries don't split at ±180°
    pra_area <- pra |>
      filter(region_key == "AK") |>
      st_shift_longitude()
    er_area <- er |>
      filter(region_key == "AK") |>
      st_shift_longitude()

    er_pts_area <- er_pts |>
      filter(region_key == "AK") |>
      st_shift_longitude() |>
      left_join(
        d_lbls |> select(-programarea_key),
        by = "ecoregion_key"
      )
    pra_pts_area <- pra_pts |>
      filter(region_key == "AK") |>
      st_shift_longitude() |>
      left_join(
        d_lbls |> select(-ecoregion_key),
        by = "programarea_key"
      )

    lgnd_pos <- "top-left"
  } else {
    # L48: keep anything intersecting the bbox of the non-AK program areas
    box_l48 <- pra |>
      filter(region_key != "AK") |>
      st_bbox() |>
      st_as_sfc()
    pra_area <- pra |>
      st_filter(box_l48, .predicate = st_intersects)
    er_area <- er |>
      st_filter(box_l48, .predicate = st_intersects)

    er_pts_area <- er_pts |>
      filter(ecoregion_key %in% er_area$ecoregion_key) |>
      left_join(
        d_lbls |> select(-programarea_key),
        by = "ecoregion_key"
      )
    pra_pts_area <- pra_pts |>
      filter(programarea_key %in% pra_area$programarea_key) |>
      left_join(
        d_lbls |> select(-ecoregion_key),
        by = "programarea_key"
      )

    lgnd_pos <- "bottom-left"
  }

  # base map: score-filled program areas from the PMTiles archive
  m <- mapboxgl(
    style = mapbox_style("dark"),
    projection = "globe"
  ) |>
    fit_bounds(bbox = er_area) |>
    msens::add_pmfill(
      url = glue("{pmtiles_base_url}/{tbl_pra_pm}.pmtiles"),
      source_layer = tbl_pra_pm,
      col_key = "programarea_key",
      d = pra_area |> sf::st_drop_geometry(),
      col_value = var_pra,
      filter_keys = pra_area$programarea_key
    )

  # add_pmfill stashes the fill scale here for the legend below
  legend_meta <- attr(m, "legend_meta")

  # overlay ecoregion outlines + labels, then legend and scale bar
  m <- m |>
    msens::add_pmline(list(
      list(
        url = glue("{pmtiles_base_url}/{tbl_er_pm}.pmtiles"),
        source_layer = tbl_er_pm,
        line_color = "lightgray",
        line_width = 5,
        filter = c("in", "ecoregion_key", er_area$ecoregion_key)
      )
    )) |>
    msens::add_pmlabel(list(
      list(source = pra_pts_area, text_field = "programarea_key"),
      list(
        source = er_pts_area,
        text_field = "ecoregion_name",
        text_color = "black",
        text_halo_color = "lightgray",
        text_halo_width = 2,
        text_halo_blur = 2,
        text_line_height = 1.5,
        text_size = 20
      )
    )) |>
    mapgl::add_legend(
      "Score",
      values = legend_meta$rng,
      colors = legend_meta$colors,
      position = lgnd_pos
    ) |>
    mapgl::add_scale_control(position = "bottom-right")

  # save widget, screenshot to PNG, then discard the intermediate html
  b <- glue("{dir_figs}/map_pra_{ifelse(is_ak, 'ak', 'l48')}{v_sfx}")
  b_html <- glue("{b}.html")
  b_files <- glue("{b}_files")
  b_png <- glue("{b}.png")
  saveWidget(m, b_html, selfcontained = FALSE, background = "transparent")
  webshot(
    b_html,
    b_png,
    vwidth = 1200,
    vheight = 800,
    zoom = 2,
    selector = "#htmlwidget_container",
    delay = 2,
    quiet = TRUE
  )
  file_delete(b_html)
  dir_delete(b_files)
}

10.2 Cell score & NPP maps

Code
# VGPM net primary productivity raster (2014-2023 average, pre-clipped)
vgpm_tif <- glue(
  "{dir_data}/raw/oregonstate.edu/vgpm.r2022.v.chl.v.sst.2160x4320_2014-2023.avg.sd_planareas.tif"
)

r_metrics <- rast(metrics_tif)
r_vgpm <- rast(vgpm_tif)
d_lbls <- read_csv(lbls_csv)

lyr_score <- "score_extriskspcat_primprod_ecoregionrescaled_equalweights"
lyr_npp <- "npp_avg"

# mask to program areas to exclude Atlantic coast
ply_mask <- vect(pra |> st_shift_longitude())
r_score <- r_metrics[[lyr_score]] |> mask(ply_mask)
# NPP is shown on a log scale
r_npp <- r_vgpm |> log() |> mask(ply_mask)

# shared color ramps / breaks so AK and L48 panels use identical scales
n_cols <- 11
rng_r_score <- range(values(r_score, na.rm = T))
cols_r_score <- rev(RColorBrewer::brewer.pal(n_cols, "Spectral"))
brks_r_score <- seq(rng_r_score[1], rng_r_score[2], length.out = n_cols)

rng_r_npp <- range(values(r_npp, na.rm = T))
cols_r_npp <- pal_viridis()(n_cols)
brks_r_npp <- seq(rng_r_npp[1], rng_r_npp[2], length.out = n_cols)

for (is_vgpm in c(T, F)) {
  # npp: is_vgpm = T # score: is_vgpm = F
  if (is_vgpm) {
    r <- r_npp
    rng_r <- round(rng_r_npp, 2)
    cols_r <- cols_r_npp
    brks_r <- brks_r_npp
    lyr <- lyr_npp
    title <- "Primary Productivity (log(NPP))"
  } else {
    r <- r_score
    rng_r <- rng_r_score
    cols_r <- cols_r_score
    brks_r <- brks_r_score
    lyr <- lyr_score
    title <- "Score"
  }

  for (is_ak in c(T, F)) {
    # is_ak = T
    if (is_ak) {
      rgn_mask <- pra |>
        filter(region_key == "AK") |>
        st_shift_longitude() |>
        vect()

      r_area <- r |>
        mask(rgn_mask) |>
        trim()

      er_area <- er |>
        filter(region_key == "AK") |>
        st_shift_longitude()

      pra_area <- pra |>
        filter(region_key == "AK") |>
        st_shift_longitude()

      er_pts_area <- er_pts |>
        filter(region_key == "AK") |>
        st_shift_longitude() |>
        left_join(
          d_lbls |> select(-programarea_key),
          by = "ecoregion_key"
        )
      pra_pts_area <- pra_pts |>
        filter(region_key == "AK") |>
        st_shift_longitude() |>
        left_join(
          d_lbls |> select(-ecoregion_key),
          by = "programarea_key"
        )

      lgnd_pos <- "top-left"
    } else {
      rgn_mask <- pra |>
        filter(region_key != "AK") |>
        st_shift_longitude() |>
        vect()

      r_area <- r |>
        mask(rgn_mask) |>
        trim()

      box_lower48 <- pra |>
        filter(region_key != "AK") |>
        st_bbox() |>
        st_as_sfc()

      er_area <- er |>
        st_filter(box_lower48, .predicate = st_intersects)
      pra_area <- pra |>
        st_filter(box_lower48, .predicate = st_intersects)

      er_pts_area <- er_pts |>
        filter(ecoregion_key %in% er_area$ecoregion_key) |>
        left_join(
          d_lbls |> select(-programarea_key),
          by = "ecoregion_key"
        )
      pra_pts_area <- pra_pts |>
        filter(programarea_key %in% pra_area$programarea_key) |>
        left_join(
          d_lbls |> select(-ecoregion_key),
          by = "programarea_key"
        )

      lgnd_pos <- "bottom-left"
    }
    scale_pos <- "bottom-right"

    er_filter <- c("in", "ecoregion_key", er_area$ecoregion_key)
    pra_filter <- c("in", "programarea_key", pra_area$programarea_key)

    m <- msens::map_cells(
      r = r_area,
      colors = cols_r,
      bounds = er_area,
      raster_opacity = 0.9,
      legend_title = title,
      legend_position = lgnd_pos,
      legend_values = rng_r,
      pmtiles_outlines = list(
        list(
          url = glue("{pmtiles_base_url}/{tbl_er_pm}.pmtiles"),
          source_layer = tbl_er_pm,
          line_color = "lightgray",
          line_width = 5,
          filter = er_filter
        ),
        list(
          url = glue("{pmtiles_base_url}/{tbl_pra_pm}.pmtiles"),
          source_layer = tbl_pra_pm,
          line_color = "black",
          line_width = 1,
          filter = pra_filter
        )
      ),
      labels = list(
        list(source = pra_pts_area, text_field = "programarea_key"),
        list(
          source = er_pts_area,
          text_field = "ecoregion_name",
          text_color = "black",
          text_halo_color = "lightgray",
          text_halo_width = 2,
          text_halo_blur = 2,
          text_line_height = 1.5,
          text_size = 20
        )
      )
    )

    b <- glue(
      "{dir_figs}/map_cell_{ifelse(is_vgpm, 'npp', 'score')}_{ifelse(is_ak, 'ak', 'l48')}{v_sfx}"
    )
    b_html <- glue("{b}.html")
    b_files <- glue("{b}_files")
    b_png <- glue("{b}.png")
    saveWidget(m, b_html, selfcontained = F, background = "transparent")
    webshot(
      b_html,
      b_png,
      vwidth = 1200,
      vheight = 800,
      zoom = 2,
      selector = "#htmlwidget_container",
      delay = 2,
      quiet = T
    )
    file_delete(b_html)
    dir_delete(b_files)
  }
}

10.3 Ecoregion × Program Area table (docx)

Code
# Word table of scores per Ecoregion (bold rows) and Program Area
tbl_docx <- glue("{dir_figs}/table_ecoregion_programarea_scores{v_sfx}.docx")

set_flextable_defaults(
  font.family = "Arial",
  font.size = 10,
  border.color = "gray",
  big.mark = ","
)

# build rename vector from metric table: metric_abbrev -> short_name
# (d_metric_labels, flds, flds_rn, d_er_pra_n come from earlier chunks)
abbrev_rn <- d_metric_labels |>
  filter(metric_key %in% flds) |>
  mutate(short = flds_rn[metric_key]) |>
  filter(short != "score") |>
  pull(short, name = metric_abbrev)

d_er_pra_n |>
  mutate(
    area_kkm2 = area_km2 / 1000 # express area in thousands of km^2
  ) |>
  mutate(across(where(is.numeric), round)) |>
  mutate(
    # bold ecoregion rows; markdown is rendered by colformat_md() below
    row_lbl = ifelse(
      row_type == "ecoregion",
      glue("**{row_lbl}**"),
      row_lbl
    )
  ) |>
  select(
    Area = row_lbl,
    Score = score,
    all_of(abbrev_rn),
    `Area (1,000 km^2^)` = area_kkm2
  ) |>
  flextable() |>
  ftExtra::colformat_md(part = "all") |>
  theme_vanilla() |>
  save_as_docx(path = tbl_docx)

# browseURL(tbl_docx)

10.4 Program Area table (docx + xlsx)

Code
# Program-Area-only score table, exported as docx, csv and xlsx
tbl_docx <- glue("{dir_figs}/table_programarea_scores{v_sfx}.docx")
tbl_xlsx <- glue("{dir_figs}/table_programarea_scores{v_sfx}.xlsx")

set_flextable_defaults(
  font.family = "Arial",
  font.size = 10,
  border.color = "gray",
  big.mark = ","
)

# build rename vector: metric_title -> short_name
title_rn <- d_metric_labels |>
  filter(metric_key %in% flds) |>
  mutate(short = flds_rn[metric_key]) |>
  filter(short != "score") |>
  pull(short, name = metric_title)

# keep only program-area rows, sorted by descending score
d_pra_only <- d_er_pra_n |>
  filter(row_type == "programarea") |>
  arrange(desc(score)) |>
  select(
    -row_type,
    -ecoregion_name,
    -ecoregion_key,
    -pct_programarea_in_ecoregion
  ) |>
  mutate(
    area_kkm2 = area_km2 / 1000 # thousands of km^2
  ) |>
  mutate(across(where(is.numeric), round)) |>
  mutate(
    row_lbl = glue("{programarea_name} ({programarea_key})")
  ) |>
  select(
    row_lbl,
    score,
    all_of(unname(title_rn)),
    area_kkm2
  ) |>
  # a program area spanning multiple ecoregions appears once per
  # ecoregion upstream; distinct() collapses those duplicates
  distinct() |>
  select(
    `Program Area` = row_lbl,
    Score = score,
    all_of(title_rn),
    `Area (1,000 km^2^)` = area_kkm2
  )

# csv export (also consumed by compare_scores() in a later chunk)
write_excel_csv(d_pra_only, pra_n_csv, na = "")

d_pra_only |>
  flextable() |>
  ftExtra::colformat_md(part = "all") |>
  theme_vanilla() |>
  save_as_docx(path = tbl_docx)

# xlsx version (plain column names without superscript)
d_pra_only |>
  rename(`Area (1,000 km2)` = `Area (1,000 km^2^)`) |>
  openxlsx::write.xlsx(tbl_xlsx)

# browseURL(tbl_docx)

10.5 Compare scores with previous version

Code
# compare pra scores between two versions ----
# Reads the per-version tbl_pra_scores_*.csv exports and returns a list:
#   d_comp      long: one row per Program Area x Component with previous
#               value, current value and diff
#   d_comp_wide wide diff-only table (Components as columns)
#   d_comp_pvt  wide prev/curr/diff table restricted to Program Areas
#               whose Score changed (NULL when none changed)
#   comp_cols   component column names common to both versions
#   ver_curr / ver_prev  echoed version labels
# Returns NULL (with a message) if either CSV is missing.
# Side effects: writes the long comparison CSV and, when scores differ,
# a pivot xlsx into dir_v. Relies on globals: dir_derived, dir_v.
compare_scores <- function(ver_curr, ver_prev) {
  csv_curr <- glue("{dir_derived}/{ver_curr}/tbl_pra_scores_{ver_curr}.csv")
  csv_prev <- glue("{dir_derived}/{ver_prev}/tbl_pra_scores_{ver_prev}.csv")

  if (!file_exists(csv_prev) || !file_exists(csv_curr)) {
    message(glue(
      "Skipping {ver_curr} vs {ver_prev}: ",
      "prev={file_exists(csv_prev)}, curr={file_exists(csv_curr)}"
    ))
    return(NULL)
  }

  # normalize the superscripted area column name for plain-text handling
  d_prev <- read_csv(csv_prev, show_col_types = FALSE) |>
    rename(`Area (1,000 km2)` = `Area (1,000 km^2^)`)
  d_curr <- read_csv(csv_curr, show_col_types = FALSE) |>
    rename(`Area (1,000 km2)` = `Area (1,000 km^2^)`)

  # component columns (exclude Program Area and Area), common to both
  comp_cols <- intersect(
    setdiff(names(d_prev), c("Program Area", "Area (1,000 km2)")),
    setdiff(names(d_curr), c("Program Area", "Area (1,000 km2)"))
  )

  # use version names for value columns (e.g., columns `v5` and `v6`)
  v_c <- ver_curr
  v_p <- ver_prev

  d_comp <- d_prev |>
    select(`Program Area`, all_of(comp_cols)) |>
    pivot_longer(
      cols = all_of(comp_cols),
      names_to = "Component",
      values_to = v_p
    ) |>
    full_join(
      d_curr |>
        select(`Program Area`, all_of(comp_cols)) |>
        pivot_longer(
          cols = all_of(comp_cols),
          names_to = "Component",
          values_to = v_c
        ),
      by = c("Program Area", "Component")
    ) |>
    mutate(
      diff = .data[[v_c]] - .data[[v_p]]
    ) |>
    arrange(Component, `Program Area`)

  comp_csv <- glue("{dir_v}/tbl_pra_scores_{ver_curr}_vs_{ver_prev}.csv")
  write_csv(d_comp, comp_csv)
  message(glue("Wrote score comparison to {comp_csv}"))

  # wide diff table: Components as columns, one row per Program Area
  d_comp_wide <- d_comp |>
    select(`Program Area`, Component, diff) |>
    pivot_wider(
      names_from = Component,
      values_from = diff
    ) |>
    arrange(`Program Area`)

  # wide prev/curr/diff pivot (like Excel comparison) ----
  # only rows where Score differs; names_glue already yields e.g.
  # Score_v5, Score_v6, Score_diff, so no post-hoc renaming is needed
  # (the previous rename_with calls replaced each suffix with itself
  # and have been removed as dead code)
  pra_diff <- d_comp |>
    filter(Component == "Score", diff != 0) |>
    pull(`Program Area`)

  score_diff_col <- "Score_diff"

  if (length(pra_diff) > 0) {
    d_comp_pvt <- d_comp |>
      filter(`Program Area` %in% pra_diff) |>
      pivot_wider(
        names_from = Component,
        values_from = c(all_of(v_p), all_of(v_c), diff),
        names_glue = "{Component}_{.value}"
      ) |>
      # Score_* columns first, then the remaining components
      select(
        `Program Area`,
        starts_with("Score_"),
        !starts_with("Score_")
      ) |>
      arrange(desc(.data[[score_diff_col]]))

    pvt_xlsx <- glue(
      "{dir_v}/tbl_pra_scores_{ver_curr}_vs_{ver_prev}_pivot.xlsx"
    )
    write_xlsx(d_comp_pvt, pvt_xlsx)
    message(glue("Wrote pivot comparison to {pvt_xlsx}"))
  } else {
    d_comp_pvt <- NULL
    message(glue("No score differences between {ver_curr} and {ver_prev}"))
  }

  list(
    d_comp = d_comp,
    d_comp_wide = d_comp_wide,
    d_comp_pvt = d_comp_pvt,
    comp_cols = comp_cols,
    ver_curr = ver_curr,
    ver_prev = ver_prev
  )
}

# compare ver vs ver_prev (current vs previous version) ----
res <- compare_scores(ver, ver_prev)
Wrote score comparison to ~/My Drive/projects/msens/data/derived/v6/tbl_pra_scores_v6_vs_v5.csv
Wrote pivot comparison to ~/My Drive/projects/msens/data/derived/v6/tbl_pra_scores_v6_vs_v5_pivot.xlsx
Code
# Show the wide diff table (ver vs ver_prev) when the comparison ran.
if (!is.null(res)) {
  dt_wide <- DT::datatable(
    res$d_comp_wide,
    caption = glue("Score differences ({res$ver_curr} - {res$ver_prev})"),
    options = list(pageLength = 25, dom = "t")
  )
  DT::formatRound(dt_wide, columns = res$comp_cols, digits = 0)
}
Code
# pivot table: rows where scores differ (ver vs ver_prev) ----
if (!is.null(res) && !is.null(res$d_comp_pvt)) {
  # round every column except the Program Area label
  num_flds <- setdiff(names(res$d_comp_pvt), "Program Area")
  cap <- glue(
    "Score pivot where different ({res$ver_curr} vs {res$ver_prev})"
  )
  DT::datatable(
    res$d_comp_pvt,
    caption = cap,
    options = list(pageLength = 25, dom = "t", scrollX = TRUE)
  ) |>
    DT::formatRound(columns = num_flds, digits = 0)
}
Code
# compare ver (current) vs the v3 baseline ----
res_v3 <- compare_scores(ver, "v3")
Wrote score comparison to ~/My Drive/projects/msens/data/derived/v6/tbl_pra_scores_v6_vs_v3.csv
Wrote pivot comparison to ~/My Drive/projects/msens/data/derived/v6/tbl_pra_scores_v6_vs_v3_pivot.xlsx
Code
# Wide diff table for the ver-vs-v3 comparison, when it ran.
if (!is.null(res_v3)) {
  cap_v3 <- glue(
    "Score differences ({res_v3$ver_curr} - {res_v3$ver_prev})"
  )
  res_v3$d_comp_wide |>
    DT::datatable(
      caption = cap_v3,
      options = list(pageLength = 25, dom = "t")
    ) |>
    DT::formatRound(columns = res_v3$comp_cols, digits = 0)
}
Code
# pivot table: rows where scores differ (ver vs v3) ----
if (!is.null(res_v3) && !is.null(res_v3$d_comp_pvt)) {
  res_v3$d_comp_pvt |>
    DT::datatable(
      caption = glue(
        "Score pivot where different ({res_v3$ver_curr} vs {res_v3$ver_prev})"
      ),
      options = list(pageLength = 25, dom = "t", scrollX = TRUE)
    ) |>
    # round every numeric column (all but the Program Area label)
    DT::formatRound(
      columns = setdiff(names(res_v3$d_comp_pvt), "Program Area"),
      digits = 0
    )
}

10.6 Multi-version comparison pivot

Code
# multi-version pivot: Score and Turtle across v3, v4 and current ver ----
vers_compare <- c("v3", "v4", ver)
csvs <- glue("{dir_derived}/{vers_compare}/tbl_pra_scores_{vers_compare}.csv")
names(csvs) <- vers_compare

# only run when every version's score CSV exists
if (all(file_exists(csvs))) {
  # read all versions into one long table
  d_all <- imap_dfr(csvs, \(f, v) {
    read_csv(f, show_col_types = FALSE) |>
      rename(`Area (1,000 km2)` = `Area (1,000 km^2^)`) |>
      select(`Program Area`, Score, Turtle) |>
      mutate(ver = v)
  })

  # pivot wide: Score_v3, Score_v4, Score_v5, Turtle_v3, ...
  d_wide <- d_all |>
    pivot_wider(
      names_from = ver,
      values_from = c(Score, Turtle),
      names_glue = "{.value}_{ver}"
    )

  # add diff columns (each later version minus the v3 baseline)
  v_cur <- ver
  d_wide <- d_wide |>
    mutate(
      Score_v4_v3 = Score_v4 - Score_v3,
      "Score_{v_cur}_v3" := .data[[glue("Score_{v_cur}")]] - Score_v3,
      Turtle_v4_v3 = Turtle_v4 - Turtle_v3,
      "Turtle_{v_cur}_v3" := .data[[glue("Turtle_{v_cur}")]] - Turtle_v3
    ) |>
    select(
      `Program Area`,
      Score_v3,
      Score_v4,
      Score_v4_v3,
      !!glue("Score_{v_cur}"),
      !!glue("Score_{v_cur}_v3"),
      Turtle_v3,
      Turtle_v4,
      Turtle_v4_v3,
      !!glue("Turtle_{v_cur}"),
      !!glue("Turtle_{v_cur}_v3")
    ) |>
    arrange(desc(.data[[glue("Score_{v_cur}_v3")]]))

  # write xlsx
  multi_xlsx <- glue("{dir_v}/tbl_pra_scores_v3_vs_v4_{ver}.xlsx")
  write_xlsx(d_wide, multi_xlsx)
  message(glue("Wrote multi-version comparison to {multi_xlsx}"))

  # display: red/white/green background on the diff columns
  num_cols <- setdiff(names(d_wide), "Program Area")
  d_wide |>
    DT::datatable(
      caption = glue(
        "Program Area scores: v3 vs v4 vs {ver} (Score and Turtle)"
      ),
      options = list(pageLength = 25, dom = "t", scrollX = TRUE)
    ) |>
    DT::formatRound(columns = num_cols, digits = 0) |>
    DT::formatStyle(
      columns = num_cols[str_detect(num_cols, "_v")],
      backgroundColor = DT::styleInterval(
        c(-0.5, 0.5),
        c("#f4cccc", "white", "#d9ead3")
      )
    )
}
Wrote multi-version comparison to ~/My Drive/projects/msens/data/derived/v6/tbl_pra_scores_v3_vs_v4_v6.xlsx

10.7 Program Area flower plots

Code
# pra list from gpkg (previously from PostGIS)
pra_all <- read_sf(pra_gpkg) |>
  st_drop_geometry() |>
  select(programarea_key, programarea_name)

# components: per-category extinction risk plus primary productivity,
# each rescaled within ecoregion (sp_cats defined in an earlier chunk)
metric_input_keys <- c(
  glue("extrisk_{sp_cats}_ecoregion_rescaled"),
  "primprod_ecoregion_rescaled"
)
m_input_seqs <- tbl(con_sdm, "metric") |>
  filter(metric_key %in% metric_input_keys) |>
  pull(metric_seq)

# build components from metric table
components <- d_metric_labels |>
  filter(metric_key %in% flds, metric_title != "Score") |>
  pull(metric_title)
# one distinct hue per component, named for lookup when plotting
component_cols <- setNames(
  hue_pal()(length(components)),
  components
)

# d_scores: overall score per program area, from the zone_metric table
m_score <- "score_extriskspcat_primprod_ecoregionrescaled_equalweights"
m_key_score <- tbl(con_sdm, "metric") |>
  filter(metric_key == !!m_score) |>
  pull(metric_seq)

d_scores <- tbl(con_sdm, "zone") |>
  filter(
    tbl == !!tbl_pra,
    fld == "programarea_key",
    value %in% pra_all$programarea_key
  ) |>
  select(programarea_key = value, zone_seq) |>
  inner_join(
    tbl(con_sdm, "zone_metric") |>
      filter(metric_seq == !!m_key_score),
    by = join_by(zone_seq)
  ) |>
  select(programarea_key, score = value) |>
  mutate(score = round(score)) |>
  collect() |>
  arrange(desc(score), programarea_key) |>
  left_join(
    pra_all,
    by = "programarea_key"
  )

# ordered factor so facets appear in descending-score order
d_scores$programarea_lbl <- factor(
  str_wrap(d_scores$programarea_name, width = 20),
  levels = str_wrap(d_scores$programarea_name, width = 20),
  ordered = T
)

# d_fl: one row per program area x component, with the component value
d_fl <- tbl(con_sdm, "zone") |>
  filter(
    tbl == !!tbl_pra,
    fld == "programarea_key",
    value %in% pra_all$programarea_key
  ) |>
  select(programarea_key = value, zone_seq) |>
  inner_join(
    tbl(con_sdm, "zone_metric"),
    by = "zone_seq"
  ) |>
  inner_join(
    tbl(con_sdm, "metric") |>
      filter(metric_seq %in% m_input_seqs) |>
      select(metric_seq, metric_key),
    by = "metric_seq"
  ) |>
  select(programarea_key, metric_key, value) |>
  arrange(programarea_key, metric_key) |>
  collect() |>
  mutate(
    # strip the extrisk_/_ecoregion_rescaled wrapping to get a short
    # component name (e.g. "marine mammal")
    component = metric_key |>
      str_replace("extrisk_", "") |>
      str_replace("_ecoregion_rescaled", "") |>
      str_replace("_", " ")
  ) |>
  select(programarea_key, component, value) |>
  arrange(programarea_key, component, value)

# add missing components with value 0, so every flower has all petals
d_0 <- expand_grid(
  programarea_key = d_scores$programarea_key,
  component = sort(unique(d_fl$component)),
  value = 0
) |>
  anti_join(
    d_fl,
    by = join_by(programarea_key, component)
  )

d_fl <- d_fl |>
  bind_rows(d_0) |>
  arrange(programarea_key, component) |>
  mutate(even = 1) # equal angular width for every petal

# build component display name lookup from metric table
component_labels <- d_metric_labels |>
  filter(metric_key %in% flds, metric_title != "Score") |>
  mutate(
    component = metric_key |>
      str_replace("extrisk_", "") |>
      str_replace("_ecoregion_rescaled", "") |>
      str_replace("_", " ")
  ) |>
  pull(metric_title, name = component)

d_fl <- d_fl |>
  left_join(
    d_scores |> select(programarea_key, programarea_lbl),
    by = "programarea_key"
  ) |>
  mutate(
    component = component_labels[component]
  ) |>
  arrange(programarea_lbl, component)

# all flowers: compute petal rectangles; after coord_polar(theta = "y")
# ymin/ymax become the angular span and xmin/xmax the radial extent
d <- d_fl |>
  arrange(programarea_lbl, component) |>
  group_by(programarea_lbl) |>
  mutate(across(!where(is.character), as.double)) |>
  mutate(
    ymax = cumsum(even),
    ymin = lag(ymax, default = 0),
    xmax = value,
    xmin = 0
  )

# Render the flower plots in several ggplot themes, saving one png and
# one pdf per theme. Depends on `d` (petal rectangles), `d_scores`
# (per-area score labels), `component_cols`, `dir_figs` and `v_sfx`
# from earlier chunks.
for (theme_name in c("minimal", "linedraw", "min_line", "dark", "void")) {
  theme_fxn <- switch(
    theme_name,
    "minimal" = theme_minimal,
    "linedraw" = theme_linedraw,
    # minimal theme with thin black grid lines
    "min_line" = function(...) {
      theme_minimal(...) %+replace%
        theme(
          panel.grid = element_line(colour = "black"),
          panel.grid.major = element_line(linewidth = rel(0.1)),
          panel.grid.minor = element_line(linewidth = rel(0.05))
        )
    },
    "dark" = theme_dark,
    "void" = theme_void
  )

  g <- d |>
    ggplot() +
    geom_rect_interactive(
      aes(
        xmin = xmin,
        xmax = xmax,
        ymin = ymin,
        ymax = ymax,
        fill = component,
        data_id = component
      ),
      # petal outline is a constant, so it belongs outside aes(); the
      # previous color = "white" inside aes() mapped a constant as an
      # aesthetic, creating a spurious "white" colour legend
      color = "white",
      alpha = 1
    ) +
    scale_fill_manual(
      values = component_cols,
      name = "Component"
    ) +
    facet_wrap(~programarea_lbl) +
    coord_polar(theta = "y") +
    # negative xlim leaves a hole in the flower center for the score label
    xlim(c(-20, max(d_scores$score, d$xmax))) +
    annotate(
      "rect",
      xmin = -Inf,
      xmax = 0,
      ymin = -Inf,
      ymax = Inf,
      fill = ifelse(theme_name == "dark", "grey20", "white"),
      color = NA
    ) +
    geom_text(
      aes(x, y, label = score),
      data = d_scores |> mutate(x = -20, y = 0),
      size = 2.5,
      fontface = "bold",
      color = ifelse(theme_name == "dark", "white", "black")
    ) +
    theme_fxn() +
    theme(
      legend.position = "bottom",
      legend.title = element_text(size = 8),
      legend.text = element_text(size = 6),
      plot.margin = unit(c(1, 1, 1, 1), "pt"),
      strip.text.x = element_text(size = 6),
      axis.title.x = element_blank(),
      axis.text.x = element_blank(),
      axis.ticks.x = element_blank(),
      axis.title.y = element_blank(),
      axis.text.y = element_blank(),
      axis.ticks.y = element_blank()
    )

  pra_fl_png <- glue(
    "{dir_figs}/programarea_flower-plot_scores_{theme_name}{v_sfx}.png"
  )
  pra_fl_pdf <- glue(
    "{dir_figs}/programarea_flower-plot_scores_{theme_name}{v_sfx}.pdf"
  )
  png(pra_fl_png, width = 1200, height = 800, res = 150)
  print(g)
  dev.off()
  pdf(pra_fl_pdf, width = 7, height = 5)
  print(g)
  dev.off()
}

10.8 Primary productivity by Program Area

Code
# zonal NPP from VGPM rasters, adapted from ingest_productivity.qmd pa_zonal_plot
dir_vgpm_yrs <- glue(
  "{dir_data}/raw/oregonstate.edu/vgpm.r2022.v.chl.v.sst.2160x4320"
)

pra <- read_sf(pra_gpkg)

# one raster layer per year, named npp_<year> from the layer timestamps
r_yrs <- dir_ls(dir_vgpm_yrs, glob = "*.tif") |>
  rast()
names(r_yrs) <- glue("npp_{time(r_yrs)}")

# mean annual NPP per program area, then avg/sd across years
d_pra_npp <- r_yrs |>
  zonal(
    pra |>
      select(programarea_key, programarea_name) |>
      vect(),
    fun = "mean",
    na.rm = T,
    as.polygons = T
  ) |>
  st_as_sf() |>
  st_drop_geometry() |>
  tibble() |>
  pivot_longer(
    cols = starts_with("npp_"),
    names_to = "year",
    values_to = "npp"
  ) |>
  filter(!is.na(npp)) |>
  mutate(
    year = str_replace(year, "npp_", "") |> as.integer(),
    # convert VGPM units (mg C / m^2 / day) to metric tons C / km^2 / yr
    npp = set_units(npp, mg / m^2 / day) |>
      set_units(t / km^2 / yr)
  ) |>
  group_by(programarea_key, programarea_name) |>
  summarize(
    npp_avg = mean(npp, na.rm = T),
    npp_sd = sd(npp, na.rm = T),
    .groups = "drop"
  ) |>
  arrange(desc(npp_avg)) |>
  mutate(
    npp_avg = npp_avg |> drop_units(),
    # NOTE(review): npp_sd is passed through without drop_units();
    # presumably sd() already returned a bare numeric — confirm, else
    # the errorbar arithmetic below would mix units and plain numbers
    npp_sd = npp_sd
  ) |>
  mutate(
    # factor ordered by descending npp_avg so bars plot left-to-right
    programarea_name = factor(programarea_name, levels = programarea_name)
  )

write_csv(d_pra_npp, pp_csv)

g <- d_pra_npp |>
  ggplot(aes(x = programarea_name, y = npp_avg)) +
  geom_bar(
    position = position_dodge(),
    stat = "identity",
    fill = "darkgreen"
  ) +
  geom_errorbar(
    aes(ymin = npp_avg - npp_sd, ymax = npp_avg + npp_sd),
    width = .4
  ) +
  labs(
    title = NULL,
    x = NULL,
    y = expression("Areal NPP (metric tons C km"^-2 ~ "yr"^-1 * ")")
  ) +
  scale_y_continuous(expand = c(0, 20)) +
  theme(
    axis.text.x = element_text(
      angle = 45,
      vjust = 1,
      hjust = 1
    )
  )

# save both raster (png) and vector (pdf) versions of the bar chart
png(pp_png, width = 1200, height = 800, res = 150)
print(g)
dev.off()
pdf(pp_pdf, width = 7, height = 5)
print(g)
dev.off()

10.9 Copy figures to dir_v

Code
# data files are already written directly to dir_v;
# copy rendered figures from local figs/ into dir_v
fig_files <- dir_ls(dir_figs, regexp = "\\.(png|pdf|docx|xlsx|csv)$")
walk(
  fig_files,
  \(f) file_copy(f, glue("{dir_v}/{path_file(f)}"), overwrite = TRUE)
)

message(glue("Copied {length(fig_files)} figure files to {dir_v}"))
Copied 19 figure files to ~/My Drive/projects/msens/data/derived/v6

11 Generate README

Programmatically generate README.md and README.html for the versioned output directory. Uses data/derived_products.csv for file descriptions.

Code
# derived_products.csv maps output filenames to human descriptions
dp_csv <- here("data/derived_products.csv")
stopifnot(file_exists(dp_csv))

# read and expand {ver} placeholders in output_file and dependencies
d_dp <- read_csv(dp_csv) |>
  rowwise() |>
  mutate(
    output_file = glue(output_file),
    dependencies = glue(dependencies)
  ) |>
  ungroup()

# list actual files in dir_v (non-recursive, exclude README itself)
v_files <- dir_ls(dir_v, recurse = F) |>
  path_file() |>
  str_subset("^README", negate = T) |>
  # exclude ~$ temp files from Word/Excel
  str_subset("^~\\$", negate = T)

# match files to descriptions; files without an entry get a blank
d_files <- tibble(file = v_files) |>
  left_join(
    d_dp |> select(file = output_file, description),
    by = "file"
  ) |>
  mutate(
    description = ifelse(is.na(description), "", description)
  ) |>
  arrange(file)

# markdown pipe table of the file listing, as a single string
lns_files <- d_files |>
  knitr::kable(format = "pipe") |>
  paste(collapse = '\n')

# build README.md content (Quarto yaml header + listing + conventions)
readme_lines <- c(
  glue(
    "
    ---
    format:
      html:
        embed-resources: true
    ---
    
    # Derived Products — {ver}
     
    Version: `{v_sfx}` | Generated: {Sys.Date()}
    
    ## File listing

    {lns_files}

    ## Naming conventions

    - `ply_*`: polygon spatial data (GeoPackage)
    - `r_*`: raster data (GeoTIFF)
    - `tbl_*`: tabular summary data (CSV) 
    - `layers_*`: layer metadata for metrics and species
    - `zone_taxon_*`: zone × taxon matrix
    - `*_{ver}`: version {ver} model outputs
    
    ## Large files
    
    Some files exceed GitHub size limits and are stored in the 
    `big-files/{ver}/` directory (not tracked by git). 
    See `sdm_parquet/` for parquet exports.
    
    _Auto-generated by `calc_scores.qmd` on {Sys.time()}_"
  )
)

readme_md <- glue("{dir_v}/README.md")
writeLines(readme_lines, readme_md)
message(glue("Wrote {readme_md}"))
Wrote ~/My Drive/projects/msens/data/derived/v6/README.md
Code
# render README.md to a self-contained README.html alongside it
quarto_render(readme_md |> path_real())
pandoc 
  to: html
  output-file: README.html
  standalone: true
  embed-resources: true
  section-divs: true
  html-math-method: mathjax
  wrap: none
  default-image-extension: png
  variables: {}
  
metadata
  document-css: false
  link-citations: true
  date-format: long
  lang: en
  
Output created: README.html
Code
message(glue("Rendered {readme_md} to HTML"))
Rendered ~/My Drive/projects/msens/data/derived/v6/README.md to HTML

12 Map score cells

Code
librarian::shelf(mapgl)

# load global raster template
# r_global <- readRDS(glue("{dir_data}/derived/r_global_0.05.rds"))
# NOTE(review): re-defines cell_tif already set in the setup chunk;
# presumably the same path — verify before removing either
cell_tif <- glue("{dir_data}/derived/r_bio-oracle_planarea.tif")

r_cell <- rast(cell_tif)

# d_metric <- tbl(con_sdm, "metric") |> collect()
# View(d_metric)
# TODO: add sequences for date_created post-export-parquet

# get species richness metric ID
# m_key <- "extrisk_all_ecoregion_rescaled"
m_key <- "score_extriskspcat_primprod_ecoregionrescaled_equalweights"
m_seq <- tbl(con_sdm, "metric") |>
  filter(metric_key == !!m_key) |>
  pull(metric_seq)

# query species richness for a region
d_m <- dbGetQuery(
  con_sdm,
  glue(
    "
  SELECT
    cm.cell_id,
    cm.value
  FROM cell_metric cm
  WHERE cm.metric_seq = (
    SELECT metric_seq
    FROM metric
    WHERE metric_key = '{m_key}' )"
  )
) |>
  tibble()
# one value per cell expected; fail loudly on duplicates
stopifnot(sum(duplicated(d_m$cell_id)) == 0)

# convert to raster using the cell template grid
r_m <- create_raster_from_df(d_m, r_cell, value_col = "value")

# display with mapgl
# TODO: make into function in msens R package
#
# Display a single-layer SpatRaster with mapgl on a dark basemap.
#
# @param r          SpatRaster to display.
# @param cols_r     color ramp, low -> high.
# @param layer_name legend title.
# @param zoom       initial zoom level (default preserves prior behavior).
# @param center     initial map center c(lon, lat); default is the
#                   ply_pa centroid previously hard-coded.
mapgl_rast <- function(
  r,
  cols_r = rev(RColorBrewer::brewer.pal(11, "Spectral")),
  layer_name = "SDM",
  zoom = 3.5,
  center = c(-106, 40.1)
) {
  # legend range from the raster min/max, to 2 significant digits
  rng_r <- minmax(r) |> as.numeric() |> signif(digits = 2)

  mapboxgl(
    style = mapbox_style("dark"),
    zoom = zoom,
    center = center
  ) |>
    add_image_source(
      id = "r_src",
      data = r,
      colors = cols_r
    ) |>
    add_raster_layer(
      id = "r_lyr",
      source = "r_src",
      raster_resampling = "nearest"
    ) |>
    mapgl::add_legend(
      layer_name,
      values = rng_r,
      colors = cols_r,
      position = "bottom-right"
    ) |>
    add_fullscreen_control(position = "top-left") |>
    add_navigation_control() |>
    add_scale_control()
}

# visualize: render the interactive mapgl widget in the notebook
mapgl_rast(r_m, layer_name = m_key)

13 Parquet export/import

This cleans up the DuckDB database by exporting tables to Parquet files (with sort order for faster reads) and then re-importing them to a new DuckDB database. This also allows us to remove indexes from the DuckDB database, which are not needed for the mapgl app and can slow down writes.

13.1 Export to Parquet

Code
# TODO: model_cell: PARTITION BY (mdl_seq);") # https://duckdb.org/docs/stable/data/parquet/overview

# open the DuckDB database read-write for the export
source(here("libs/sdm_functions.R"))
drv_export <- duckdb(dbdir = sdm_db, read_only = FALSE)
con_sdm <- dbConnect(drv_export) |>
  load_duckdb_extensions()

# dbListTables(con_sdm)
dbExecute(con_sdm, "SET threads = 7;")
[1] 0
Code
dbExecute(con_sdm, "SET memory_limit = '20GB';") # cap memory for the big model_cell export
[1] 0
Code
# fresh parquet output directory (delete any previous export)
dir_pq <- normalizePath(dir_pq)
if (dir_exists(dir_pq)) {
  dir_delete(dir_pq)
}
dir_create(dir_pq)

# OLD: add primary keys, indexes
# NEW: skip indexes, sort on export, per [performance](https://duckdb.org/docs/stable/guides/performance/indexing.html)

# dbRemoveTable(con_sdm, "cell_model", force = T)
# sort keys per table: exported parquet is ORDER BY these columns
tbl_keys <- list(
  cell = "cell_id",
  cell_metric = c("cell_id", "metric_seq"),
  # cell_model  = c("cell_id", "mdl_seq"),   # source: model_cell; alternative key sorting order
  dataset = "ds_key",
  listing = "spp_id",
  metric = "metric_seq",
  model = "mdl_seq",
  model_cell = c("mdl_seq", "cell_id"), # *big* one
  species = "sp_seq",
  taxon = "taxon_id",
  taxon_model = c("taxon_id", "ds_key"),
  zone = "zone_seq",
  zone_cell = c("zone_seq", "cell_id"),
  zone_metric = c("zone_seq", "metric_seq"),
  zone_taxon = c("zone_fld", "zone_value", "mdl_seq", "taxon_id")
)
# tbl_keys and the database tables must match exactly (both directions)
stopifnot(length(setdiff(names(tbl_keys), dbListTables(con_sdm))) == 0)
stopifnot(length(setdiff(dbListTables(con_sdm), names(tbl_keys))) == 0)

# export parquet files with sort order
# export parquet files with sort order
# Export each table to sorted, zstd-compressed parquet; sorting on the
# key columns substitutes for indexes per DuckDB performance guidance.
for (tbl in names(tbl_keys)) {
  keys <- tbl_keys[[tbl]]
  keys_str <- paste(keys, collapse = ', ')
  pq_file <- glue("{dir_pq}/{tbl}.parquet")

  message(glue(
    "Exporting {tbl} (ORDER BY {keys_str}) to parquet  ~  {Sys.time()}"
  ))

  # cell_model is an alternate key ordering of the model_cell source table
  fro_tbl <- tbl
  if (tbl == "cell_model") {
    fro_tbl <- "model_cell"
  }
  stopifnot(all(keys %in% dbListFields(con_sdm, fro_tbl)))

  if (tbl %in% c("cell_model", "model_cell")) {
    # big table: list keys first so parquet column order matches sort keys
    flds <- dbListFields(con_sdm, fro_tbl) |>
      setdiff(keys)
    # quote each identifier individually; the previous
    # paste(flds, collapse = '\", \"') dropped the outermost quotes, so
    # any table with more than one non-key field produced invalid SQL
    flds_str <- paste0('"', flds, '"', collapse = ", ")
    sql <- glue(
      "COPY (
      SELECT {keys_str}, {flds_str} FROM {fro_tbl} 
      ORDER BY {keys_str}) 
      TO '{pq_file}' 
    (FORMAT parquet, PER_THREAD_OUTPUT, COMPRESSION zstd) "
    )
  } else {
    sql <- glue(
      "COPY (
      SELECT * FROM {fro_tbl} 
      ORDER BY {keys_str}) 
      TO '{pq_file}' 
    (FORMAT parquet, PER_THREAD_OUTPUT, COMPRESSION zstd) "
    )
  }
  message(sql)
  dbExecute(con_sdm, sql)
}
Exporting cell (ORDER BY cell_id) to parquet  ~  2026-03-25 15:20:38.625874
COPY (
  SELECT * FROM cell 
  ORDER BY cell_id) 
  TO '/Users/bbest/_big/msens/derived/v6/sdm_parquet/cell.parquet' 
(FORMAT parquet, PER_THREAD_OUTPUT, COMPRESSION zstd) 
Exporting cell_metric (ORDER BY cell_id, metric_seq) to parquet  ~  2026-03-25 15:20:39.620685
COPY (
  SELECT * FROM cell_metric 
  ORDER BY cell_id, metric_seq) 
  TO '/Users/bbest/_big/msens/derived/v6/sdm_parquet/cell_metric.parquet' 
(FORMAT parquet, PER_THREAD_OUTPUT, COMPRESSION zstd) 
Exporting dataset (ORDER BY ds_key) to parquet  ~  2026-03-25 15:20:42.057256
COPY (
  SELECT * FROM dataset 
  ORDER BY ds_key) 
  TO '/Users/bbest/_big/msens/derived/v6/sdm_parquet/dataset.parquet' 
(FORMAT parquet, PER_THREAD_OUTPUT, COMPRESSION zstd) 
Exporting listing (ORDER BY spp_id) to parquet  ~  2026-03-25 15:20:42.079975
COPY (
  SELECT * FROM listing 
  ORDER BY spp_id) 
  TO '/Users/bbest/_big/msens/derived/v6/sdm_parquet/listing.parquet' 
(FORMAT parquet, PER_THREAD_OUTPUT, COMPRESSION zstd) 
Exporting metric (ORDER BY metric_seq) to parquet  ~  2026-03-25 15:20:42.096065
COPY (
  SELECT * FROM metric 
  ORDER BY metric_seq) 
  TO '/Users/bbest/_big/msens/derived/v6/sdm_parquet/metric.parquet' 
(FORMAT parquet, PER_THREAD_OUTPUT, COMPRESSION zstd) 
Exporting model (ORDER BY mdl_seq) to parquet  ~  2026-03-25 15:20:42.1101
COPY (
  SELECT * FROM model 
  ORDER BY mdl_seq) 
  TO '/Users/bbest/_big/msens/derived/v6/sdm_parquet/model.parquet' 
(FORMAT parquet, PER_THREAD_OUTPUT, COMPRESSION zstd) 
Exporting model_cell (ORDER BY mdl_seq, cell_id) to parquet  ~  2026-03-25 15:20:42.190137
COPY (
  SELECT mdl_seq, cell_id, value FROM model_cell 
  ORDER BY mdl_seq, cell_id) 
  TO '/Users/bbest/_big/msens/derived/v6/sdm_parquet/model_cell.parquet' 
(FORMAT parquet, PER_THREAD_OUTPUT, COMPRESSION zstd) 
Exporting species (ORDER BY sp_seq) to parquet  ~  2026-03-25 15:22:51.84197
COPY (
  SELECT * FROM species 
  ORDER BY sp_seq) 
  TO '/Users/bbest/_big/msens/derived/v6/sdm_parquet/species.parquet' 
(FORMAT parquet, PER_THREAD_OUTPUT, COMPRESSION zstd) 
Exporting taxon (ORDER BY taxon_id) to parquet  ~  2026-03-25 15:22:53.105491
COPY (
  SELECT * FROM taxon 
  ORDER BY taxon_id) 
  TO '/Users/bbest/_big/msens/derived/v6/sdm_parquet/taxon.parquet' 
(FORMAT parquet, PER_THREAD_OUTPUT, COMPRESSION zstd) 
Exporting taxon_model (ORDER BY taxon_id, ds_key) to parquet  ~  2026-03-25 15:22:53.242183
COPY (
  SELECT * FROM taxon_model 
  ORDER BY taxon_id, ds_key) 
  TO '/Users/bbest/_big/msens/derived/v6/sdm_parquet/taxon_model.parquet' 
(FORMAT parquet, PER_THREAD_OUTPUT, COMPRESSION zstd) 
Exporting zone (ORDER BY zone_seq) to parquet  ~  2026-03-25 15:22:53.30985
COPY (
  SELECT * FROM zone 
  ORDER BY zone_seq) 
  TO '/Users/bbest/_big/msens/derived/v6/sdm_parquet/zone.parquet' 
(FORMAT parquet, PER_THREAD_OUTPUT, COMPRESSION zstd) 
Exporting zone_cell (ORDER BY zone_seq, cell_id) to parquet  ~  2026-03-25 15:22:53.342024
COPY (
  SELECT * FROM zone_cell 
  ORDER BY zone_seq, cell_id) 
  TO '/Users/bbest/_big/msens/derived/v6/sdm_parquet/zone_cell.parquet' 
(FORMAT parquet, PER_THREAD_OUTPUT, COMPRESSION zstd) 
Exporting zone_metric (ORDER BY zone_seq, metric_seq) to parquet  ~  2026-03-25 15:22:53.476556
COPY (
  SELECT * FROM zone_metric 
  ORDER BY zone_seq, metric_seq) 
  TO '/Users/bbest/_big/msens/derived/v6/sdm_parquet/zone_metric.parquet' 
(FORMAT parquet, PER_THREAD_OUTPUT, COMPRESSION zstd) 
Exporting zone_taxon (ORDER BY zone_fld, zone_value, mdl_seq, taxon_id) to parquet  ~  2026-03-25 15:22:53.498206
COPY (
  SELECT * FROM zone_taxon 
  ORDER BY zone_fld, zone_value, mdl_seq, taxon_id) 
  TO '/Users/bbest/_big/msens/derived/v6/sdm_parquet/zone_taxon.parquet' 
(FORMAT parquet, PER_THREAD_OUTPUT, COMPRESSION zstd) 
Code
message(glue("Exporting finished  ~  {Sys.time()}")) # ~ 1 min
Exporting finished  ~  2026-03-25 15:22:53.641652
Code
# tear down the export connection: disconnect before stopping the driver,
# then drop both bindings and gc() so duckdb releases its file handles
dbDisconnect(con_sdm)
duckdb::duckdb_shutdown(drv_export)
rm(con_sdm, drv_export)
gc()
           used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
Ncells  5260109 281.0    8732378 466.4         NA  8732378 466.4
Vcells 14601778 111.5   46615312 355.7      24576 72354749 552.1

13.2 Import from Parquet

Code
# dir_pq and sdm_db defined in setup chunk

# close and drop any connection left over from an earlier chunk
if (exists("con_sdm")) {
  if (dbIsValid(con_sdm)) {
    dbDisconnect(con_sdm)
  }
  rm(con_sdm)
}
gc()
           used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
Ncells  5259409 280.9    8732378 466.4         NA  8732378 466.4
Vcells 14601879 111.5   46615312 355.7      24576 72354749 552.1
Code
Sys.sleep(1)

# delete the target database file plus any stale write-ahead log,
# so the parquet import starts from a clean slate
sdm_wal <- paste0(sdm_db, ".wal")
for (f in c(sdm_db, sdm_wal)) {
  if (file_exists(f)) {
    file_delete(f)
  }
}

# load project helpers (presumably defines load_duckdb_extensions below;
# NOTE(review): confirm sdm_functions.R is safe to re-source mid-notebook)
source(here("libs/sdm_functions.R"))
# create a fresh database and immediately checkpoint to ensure the main file
# exists on disk (duckdb WAL-only mode never creates the .duckdb file otherwise)
drv_import <- duckdb(dbdir = sdm_db, read_only = FALSE)
con_sdm <- dbConnect(drv_import) |>
  load_duckdb_extensions()
dbExecute(con_sdm, "CHECKPOINT")
[1] 0
Code
# verify the CHECKPOINT above materialized the .duckdb file on disk
db_exists <- file_exists(sdm_db)
stopifnot(
  `sdm_db not created after initial CHECKPOINT` = db_exists
)

# dbExecute(con_sdm, "SET threads = 8;")
# dbExecute(con_sdm, "SET memory_limit = '20GB';")

# import parquet ----
# each *.parquet entry is a directory of per-thread files (PER_THREAD_OUTPUT),
# read back with a /*.parquet glob; skip tables that already exist
for (pq in dir_ls(dir_pq, glob = "*.parquet")) {
  # pq = dir_ls(dir_pq, glob = "*.parquet")[2]
  tbl <- path_file(pq) |> str_remove("\\.parquet$")
  if (dbExistsTable(con_sdm, tbl)) {
    next
  }
  message(glue("Importing {tbl} from {basename(pq)}/*.parquet"))
  sql <- glue("CREATE TABLE {tbl} AS SELECT * FROM '{pq}/*.parquet'")
  dbExecute(con_sdm, sql)
}
Importing cell from cell.parquet/*.parquet
Importing cell_metric from cell_metric.parquet/*.parquet
Importing dataset from dataset.parquet/*.parquet
Importing listing from listing.parquet/*.parquet
Importing metric from metric.parquet/*.parquet
Importing model from model.parquet/*.parquet
Importing model_cell from model_cell.parquet/*.parquet
Importing species from species.parquet/*.parquet
Importing taxon from taxon.parquet/*.parquet
Importing taxon_model from taxon_model.parquet/*.parquet
Importing zone from zone.parquet/*.parquet
Importing zone_cell from zone_cell.parquet/*.parquet
Importing zone_metric from zone_metric.parquet/*.parquet
Importing zone_taxon from zone_taxon.parquet/*.parquet
Code
# create index on 2nd column of multi-column index from parquet export
# (disabled — kept for reference; parquet export order already clusters rows)
# tbl_idx <- list(
#   cell_metric = "metric_seq",
#   model_cell  = "mdl_seq",
#   zone_cell   = "cell_id",
#   zone_metric = "metric_seq")
# for (tbl in names(tbl_idx)){      # tbl = "model_cell"
#   fld <- tbl_idx[[tbl]]
#   idx <- glue("idx_{tbl}_{fld}")  # idx_model_cell_mdl_seq
#   stopifnot(fld %in% dbListFields(con_sdm, tbl))
#
#   sql <- glue("CREATE INDEX IF NOT EXISTS {idx} ON {tbl} ({fld});")
#   message(sql)
#   dbExecute(con_sdm, sql)
# }

# sequences ----
# for auto-incrementing primary keys
# map: table name -> its integer primary-key column, used below to rebuild
# a DuckDB sequence starting past each table's current MAX(pkey)
tbl_seq <- list(
  metric = "metric_seq",
  model = "mdl_seq",
  species = "sp_seq",
  zone = "zone_seq"
)
# all four tables must have been imported from parquet before this point
stopifnot(all(names(tbl_seq) %in% dbListTables(con_sdm)))

# rebuild one auto-increment sequence per table, starting just past the
# current maximum primary-key value, and wire it in as the column default
for (tbl in names(tbl_seq)) {
  # tbl = "metric"
  fld <- tbl_seq[[tbl]]
  id <- glue("seq_{tbl}")

  # COALESCE guards the empty-table case: MAX() on zero rows returns NULL,
  # which would surface as NA and make CREATE SEQUENCE ... START NA fail
  mx <- dbGetQuery(
    con_sdm,
    glue("SELECT COALESCE(MAX({fld}), 0) AS mx FROM {tbl}")
  ) |>
    pull(1)
  nxt <- mx + 1

  # drop-and-recreate so re-running this chunk is idempotent
  dbExecute(con_sdm, glue("DROP SEQUENCE IF EXISTS {id}"))
  sql <- glue("CREATE SEQUENCE {id} START {nxt}")
  message(sql)
  dbExecute(con_sdm, sql)
  sql <- glue(
    "ALTER TABLE {tbl} ALTER COLUMN {fld} SET DEFAULT nextval('{id}')"
  )
  message(sql)
  dbExecute(con_sdm, sql)
}
CREATE SEQUENCE seq_metric START 948
ALTER TABLE metric ALTER COLUMN metric_seq SET DEFAULT nextval('seq_metric')
CREATE SEQUENCE seq_model START 63662
ALTER TABLE model ALTER COLUMN mdl_seq SET DEFAULT nextval('seq_model')
CREATE SEQUENCE seq_species START 63657
ALTER TABLE species ALTER COLUMN sp_seq SET DEFAULT nextval('seq_species')
CREATE SEQUENCE seq_zone START 250
ALTER TABLE zone ALTER COLUMN zone_seq SET DEFAULT nextval('seq_zone')
Code
# date_created ----
# give every table with a single date_created column a CURRENT_DATE default
for (tbl in dbListTables(con_sdm)) {
  # tbl = "dataset"
  fld <- str_subset(dbListFields(con_sdm, tbl), "date_created")
  if (length(fld) != 1) {
    next
  }
  sql <- glue(
    "ALTER TABLE {tbl} ALTER COLUMN date_created SET DEFAULT CURRENT_DATE;"
  )
  message(sql)
  a <- dbExecute(con_sdm, sql)
}
ALTER TABLE dataset ALTER COLUMN date_created SET DEFAULT CURRENT_DATE;
ALTER TABLE metric ALTER COLUMN date_created SET DEFAULT CURRENT_DATE;
ALTER TABLE model ALTER COLUMN date_created SET DEFAULT CURRENT_DATE;
ALTER TABLE zone ALTER COLUMN date_created SET DEFAULT CURRENT_DATE;
Code
# checkpoint WAL to main DB file, then shut down cleanly
# (bare dbExecute so its row count echoes in the rendered output)
dbExecute(con_sdm, "CHECKPOINT")
[1] 0
Code
# tear down the import connection: disconnect before stopping the driver,
# then drop both bindings and gc() so duckdb releases its file handles
dbDisconnect(con_sdm)
duckdb::duckdb_shutdown(drv_import)
rm(con_sdm, drv_import)
gc()
           used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
Ncells  5260333 281.0    8732378 466.4         NA  8732378 466.4
Vcells 14603400 111.5   46615312 355.7      24576 72354749 552.1
Code
Sys.sleep(1)
# confirm the checkpoint + shutdown left the main .duckdb file on disk
db_ok <- file_exists(sdm_db)
stopifnot(
  `sdm_db missing after import_parquet CHECKPOINT+shutdown` = db_ok
)
message(glue(
  "sdm.duckdb exists: {file_exists(sdm_db)}  size: {file_size(sdm_db)}"
))
sdm.duckdb exists: TRUE  size: 3.24G

14 Deploy to server

Sync derived data and app code to the msens server.

Code
# sync versioned derived data and app code to the msens server
deploy_sh <- here("libs/deploy_to_server.sh")
stopifnot(file_exists(deploy_sh))

# duckdb shutdown may need a moment for file handles to release,
# so poll for the database file before deploying
for (i in 1:5) {
  if (file_exists(sdm_db)) {
    break
  }
  # announce BEFORE sleeping so the log reflects the wait as it happens
  # (previously logged after the sleep, misreporting the delay)
  message(glue("waiting for sdm_db ({i}/5)..."))
  gc() # flush any orphaned duckdb driver finalizers
  Sys.sleep(2)
}
stopifnot(
  `sdm_db not found after parquet import — check import_parquet chunk` = file_exists(
    sdm_db
  )
)
# TRUE (not reassignable shorthand T); capture and check the exit status so a
# failed deploy stops the render instead of being silently ignored
status <- system2("bash", args = c(deploy_sh, ver), wait = TRUE)
stopifnot(`deploy_to_server.sh exited with non-zero status` = status == 0)

15 Close Connection

Code
# final cleanup: close the connection if an earlier chunk left it open;
# shutdown = TRUE (not reassignable shorthand T) also stops the duckdb driver
if (exists("con_sdm") && dbIsValid(con_sdm)) {
  dbDisconnect(con_sdm, shutdown = TRUE)
}