Published

2025-12-09 15:04:35

1 SDM Validation per Ecoregion by Occurrence

1.1 Overview

Get species of interest from the Marine Sensitivity Database and validate ecoregions using GBIF and WoRMS occurence data.

Process:

  • Get raster for species of interest from Marine Sensitivity Database
  • Get occurrence data from GBIF and WoRMS for those species
  • Compare presence of species in Ecoregions for SDM vs occurrence data using different filters
  • Determine the best filter to systematically apply that maximizes matches between SDM and occurrence data, while minimizing mismatches (additions and subtractions)

TODO:

  • Buffer 10 km to include Rice’s whale occurrences

1.2 Species of Interest

Walrus, North Atlantic Right Whale, Humpback Whale, Fin Whale, Atlantic Sturgeon, Grey Seal, Atlantic Cod, Common Dolphin, Atlantic Lobster, Bowhead Whale, Black-capped Petrel. Common Tern, Calanus finmarchicus, Meganyctiphanes norvegica, Rice’s Whale

Code
librarian::shelf(
  aws.s3,
  DBI,
  DT,
  dplyr,
  duckdb,
  glue,
  h3jsr,
  here,
  jsonlite,
  mapgl,
  mapview,
  purrr,
  readr,
  rgbif,
  sf,
  stringr,
  terra,
  tidyr,
  worrms,
  quiet = T
)
Warning: package 'mapgl' was built under R version 4.5.2
Code
verbose <- T
is_server <- Sys.info()[["sysname"]] == "Linux"
dir_private <- ifelse(
  is_server,
  "/share/private",
  "~/My Drive/private"
)
dir_data <- ifelse(
  is_server,
  "/share/data",
  "~/My Drive/projects/msens/data"
)
mapbox_tkn_txt <- glue("{dir_private}/mapbox_token_bdbest.txt")
maptile_tkn_txt <- glue("{dir_private}/mapbox_token_bdbest.txt")
cell_tif <- glue("{dir_data}/derived/r_bio-oracle_planarea.tif")
sdm_db <- glue("{dir_data}/derived/sdm.duckdb")
spp15_csv <- here("data/sdm_occ_spp15.csv")
er_gpkg <- glue("{dir_data}/derived/downloads/ply_ecoregions_2025.gpkg")
h3_rds <- here("data/sdm_occ_validation/worms_h3_cells.rds")

# db ----
con_sdm <- dbConnect(duckdb(), dbdir = sdm_db, read_only = T)

# data prep ----
r_cell <- rast(cell_tif)

get_mdl_rast <- function(mdl_seq) {
  # mdl_seq = 3

  d <- tbl(con_sdm, "model_cell") |>
    filter(mdl_seq == !!mdl_seq) |>
    select(cell_id, value) |>
    collect()

  r <- init(r_cell[[1]], NA)
  r[d$cell_id] <- d$value
  names(r) <- "value"

  r
}

get_mdl_ecoregions <- function(mdl_seq) {
  # mdl_seq = 3
  tbl(con_sdm, "model_cell") |>
    filter(
      mdl_seq == !!mdl_seq,
      !is.na(value),
      value > 0
    ) |>
    select(cell_id) |>
    left_join(
      tbl(con_sdm, "zone_cell") |>
        select(cell_id, zone_seq),
      by = join_by(cell_id)
    ) |>
    left_join(
      tbl(con_sdm, "zone") |>
        filter(tbl == "ply_ecoregions_2025") |>
        select(zone_seq, er_key = value),
      by = join_by(zone_seq)
    ) |>
    filter(!is.na(er_key)) |>
    group_by(er_key) |>
    summarise(n_sdm_cell = n()) |>
    collect()
}
Code
if (!file.exists(spp15_csv)) {
  d_spp15 <- tribble(
    ~sp_sci                     , ~sp_cmn                      ,
    "Odobenus rosmarus"         , "Walrus"                     ,
    "Eubalaena glacialis"       , "North Atlantic Right Whale" ,
    "Megaptera novaeangliae"    , "Humpback Whale"             ,
    "Balaenoptera physalus"     , "Fin Whale"                  ,
    "Acipenser oxyrinchus"      , "Atlantic Sturgeon"          ,
    "Halichoerus grypus"        , "Grey Seal"                  ,
    "Gadus morhua"              , "Atlantic Cod"               ,
    "Delphinus delphis"         , "Common Dolphin"             ,
    "Homarus americanus"        , "Atlantic Lobster"           ,
    "Balaena mysticetus"        , "Bowhead Whale"              ,
    "Pterodroma hasitata"       , "Black-capped Petrel"        ,
    "Sterna hirundo"            , "Common Tern"                ,
    "Calanus finmarchicus"      , "Calanus finmarchicus"       ,
    "Meganyctiphanes norvegica" , "Meganyctiphanes norvegica"  ,
    "Balaenoptera ricei"        , "Rice's Whale"
  ) |>
    relocate(sp_cmn) |>
    arrange(sp_cmn) |>
    mutate(
      worms_id = map_int(sp_sci, \(x) {
        worrms::wm_name2id(x) |>
          as.integer()
      }),
      gbif_id = map_int(sp_sci, \(x) {
        rgbif::name_backbone(x) |>
          pull(usageKey) |>
          as.integer()
      }),
      mdl_seq = map_int(sp_sci, \(x) {
        tbl(con_sdm, "taxon") |>
          filter(is_ok) |>
          filter(scientific_name == x) |>
          pull(mdl_seq)
      })
    )
  write_csv(d_spp15, spp15_csv)
} else {
  d_spp15 <- read_csv(spp15_csv, show_col_types = F)
}

d_spp15 |>
  mutate(
    worms_id = glue(
      "<a href='https://www.marinespecies.org/aphia.php?p=taxdetails&id={worms_id}' target='_blank'>{worms_id}</a>"
    ),
    gbif_id = glue(
      "<a href='https://www.gbif.org/species/{gbif_id}' target='_blank'>{gbif_id}</a>"
    ),
    mdl_seq = glue(
      "<a href='https://shiny.marinesensitivity.org/mapsp/?mdl_seq={mdl_seq}' target='_blank'>{mdl_seq}</a>"
    )
  ) |>
  datatable(
    caption = "Species of interest with IDs from WoRMS (used by OBIS), GBIF and the MarineSensitivity models (`mdl_seq`)",
    escape = F
  )

1.3 Get Ecoregions from Marine Sensitivity Database

Code
ply_er <- read_sf(er_gpkg) |>
  rename(er_key = ecoregion_key)

d_spp_er <- d_spp15 |>
  mutate(
    ecoregions = map(mdl_seq, get_mdl_ecoregions)
  ) |>
  select(sp_cmn, worms_id, ecoregions) |>
  unnest(ecoregions) |>
  arrange(sp_cmn, er_key)

1.4 Get Occurrence Data from OBIS / WoRMS

  • iobis/speciesgrids: gridded datasets of WoRMS aligned marine species distributions as GeoParquet based on the OBIS and GBIF occurrence snapshots. The package currently supports Geohash and H3 grid output.
Code
# Set up duckdb connection and extensions
con <- dbConnect(duckdb())
dbSendQuery(con, "INSTALL httpfs; LOAD httpfs;")
<duckdb_result 3e4e0 connection=74790 statement='INSTALL httpfs; LOAD httpfs;'>
Code
dbSendQuery(con, "INSTALL spatial; LOAD spatial;")
<duckdb_result 945b0 connection=74790 statement='INSTALL spatial; LOAD spatial;'>
Code
dbSendQuery(con, "INSTALL h3 FROM community; LOAD h3;")
<duckdb_result 85dd0 connection=74790 statement='INSTALL h3 FROM community; LOAD h3;'>
Code
# Query
# species <- dbGetQuery(
#   con,
#   glue(
#     "
#   select kingdom, phylum, class, family, genus, species, AphiaID
#   from read_parquet('s3://obis-products/speciesgrids/h3_7/*')
#   where ST_Intersects(geometry, ST_GeomFromText('{wkt}'))
#   group by kingdom, phylum, class, family, genus, species, AphiaID
# "
#   )
# )

# d_op <- get_bucket_df(
#   's3://obis-products',
#   max = Inf
# )

# d_sg <- get_bucket_df(
#   's3://obis-products',
#   prefix = 'speciesgrids/',
#   max = Inf
# )
# which(d_sg$Key == "speedy/distribution_137077.geojson")

# pq <- "s3://obis-products/speciesgrids/h3_7/*"

# column_info <- dbGetQuery(
#   con,
#   glue(
#     "DESCRIBE SELECT * FROM read_parquet('{pq}');"
#   )
# )
# print(column_info)

# get all cells for species with AphiaID 156263
# worms_id <- 137077 # _Odobenus rosmarus_ (walrus)
# Fix the query to use h3_cell_to_boundary_wkt instead of h3_cell_to_boundary_wkb
# d_sp_h3 <- dbGetQuery(
#   con,
#   glue(
#     "select *, h3_cell_to_boundary_wkt(cell) AS geometry_wkt
#     from read_parquet('{pq}')
#     where AphiaID = {aphia_id}"
#   )
# )

for (i in 1:nrow(d_spp15)) {
  # i = 1
  worms_id <- d_spp15$worms_id[i]
  sp_cmn <- d_spp15$sp_cmn[i]

  h3_csv <- here(glue("data/sdm_occ_validation/h3_cells_worms-{worms_id}.csv"))

  if (!file.exists(h3_csv)) {
    message(glue(
      "Querying {i}/15: {sp_cmn} (worms_id: {worms_id}) ~ {Sys.Date()}"
    ))

    d_h3 <- dbGetQuery(
      con,
      glue(
        "SELECT cell AS h3_cell, records, min_year, max_year, source_obis, source_gbif
        from read_parquet('s3://obis-products/speciesgrids/h3_7/*') 
        where AphiaID = {worms_id}"
      )
    )
    write_csv(d_h3, h3_csv)
  }
}

if (!file.exists(h3_rds)) {
  ply_h3 <- tibble(
    csv = list.files(
      here("data/sdm_occ_validation"),
      pattern = "h3_cells_worms-.*\\.csv$",
      full.names = T
    )
  ) |>
    mutate(
      worms_id = str_extract(basename(csv), "(?<=worms-)[0-9]+") |>
        as.integer(),
      data = map(csv, read_csv, show_col_types = F),
    ) |>
    select(-csv) |>
    arrange(worms_id) |>
    unnest(data) |>
    mutate(
      geom = cell_to_polygon(h3_cell, simple = T)
    ) |>
    st_as_sf()

  write_rds(ply_h3, h3_rds)
}

ply_h3 <- read_rds(h3_rds)

1.5 Map of ecoregions with walrus occurrence h3 hexagons

Code
ply_h3_walrus <- ply_h3 |>
  filter(worms_id == 137077) |>
  mutate(
    popup = glue(
      "<strong>H3 Cell:</strong> {h3_cell}<br/>
      <strong>Records:</strong> {records}<br/>
      <strong>Min Year:</strong> {min_year}<br/>
      <strong>Max Year:</strong> {max_year}<br/>
      <strong>Source OBIS:</strong> {source_obis}<br/>
      <strong>Source GBIF:</strong> {source_gbif}"
    )
  )

maplibre(
  style = maptiler_style("bright", variant = "dark"),
  zoom = 3.5,
  center = c(-106, 40.1)
) |>
  add_vector_source(
    id = "er_src",
    url = "https://api.marinesensitivity.org/tilejson?table=public.ply_ecoregions_2025"
  ) |>
  add_line_layer(
    id = "er_ln",
    source = "er_src",
    source_layer = "public.ply_ecoregions_2025",
    line_color = "gray",
    line_opacity = 1,
    line_width = 3
  ) |>
  add_fill_layer(
    id = "walrus_h3",
    source = ply_h3_walrus,
    fill_color = "white",
    fill_opacity = 0.5,
    popup = "popup"
  ) |>
  add_fullscreen_control() |>
  add_navigation_control() |>
  add_scale_control() |>
  add_geocoder_control()

1.6 Map of ecoregions with Rice’s whale occurrence h3 hexagons

Code
ply_h3_ricei <- ply_h3 |>
  filter(worms_id == 1576133) |>
  mutate(
    popup = glue(
      "<strong>H3 Cell:</strong> {h3_cell}<br/>
      <strong>Records:</strong> {records}<br/>
      <strong>Min Year:</strong> {min_year}<br/>
      <strong>Max Year:</strong> {max_year}<br/>
      <strong>Source OBIS:</strong> {source_obis}<br/>
      <strong>Source GBIF:</strong> {source_gbif}"
    )
  )

maplibre(
  style = maptiler_style("bright", variant = "dark"),
  zoom = 3.5,
  center = c(-106, 40.1)
) |>
  add_vector_source(
    id = "er_src",
    url = "https://api.marinesensitivity.org/tilejson?table=public.ply_ecoregions_2025"
  ) |>
  add_line_layer(
    id = "er_ln",
    source = "er_src",
    source_layer = "public.ply_ecoregions_2025",
    line_color = "gray",
    line_opacity = 1,
    line_width = 3
  ) |>
  add_fill_layer(
    id = "ricei_h3",
    source = ply_h3_ricei,
    fill_color = "white",
    fill_opacity = 0.5,
    popup = "popup"
  ) |>
  add_fullscreen_control() |>
  add_navigation_control() |>
  add_scale_control() |>
  add_geocoder_control()

1.7 Apply Filters to Occurrence Data

Code
# apply
d_filt0 <- ply_h3 |>
  # unfiltered, regardless of source (obis/gbif) or year (even has min_year)
  select(worms_id, h3_cell) |>
  st_join(
    ply_er |>
      select(er_key)
  ) |>
  filter(!is.na(er_key)) |>
  st_drop_geometry() |>
  group_by(worms_id, er_key) |>
  summarise(
    n_occ_hex = n(),
    .groups = "drop"
  ) # 75 rows

d_filt1 <- ply_h3 |>
  # source_obis
  filter(
    source_obis == T
  ) |>
  select(worms_id, h3_cell) |>
  st_join(
    ply_er |>
      select(er_key)
  ) |>
  filter(!is.na(er_key)) |>
  st_drop_geometry() |>
  group_by(worms_id, er_key) |>
  summarise(
    n_occ_hex = n(),
    .groups = "drop"
  ) # 69 rows

d_filt2 <- ply_h3 |>
  # source_obis, has min_year data
  filter(
    source_obis == T,
    !is.na(min_year)
  ) |>
  select(worms_id, h3_cell) |>
  st_join(
    ply_er |>
      select(er_key)
  ) |>
  filter(!is.na(er_key)) |>
  st_drop_geometry() |>
  group_by(worms_id, er_key) |>
  summarise(
    n_occ_hex = n(),
    .groups = "drop"
  ) # 62 rows

d_filt3 <- ply_h3 |>
  # source_obis, min_year >= 1975 (last 50 years)
  filter(
    source_obis == T,
    !is.na(min_year),
    min_year >= 1975
  ) |>
  select(worms_id, h3_cell) |>
  st_join(
    ply_er |>
      select(er_key)
  ) |>
  filter(!is.na(er_key)) |>
  st_drop_geometry() |>
  group_by(worms_id, er_key) |>
  summarise(
    n_occ_hex = n(),
    .groups = "drop"
  ) # 55 rows

d_filt4 <- ply_h3 |>
  # source_obis, min_year >= 2000 (last 25 years)
  filter(
    source_obis == T,
    !is.na(min_year),
    min_year >= 2000
  ) |>
  select(worms_id, h3_cell) |>
  st_join(
    ply_er |>
      select(er_key)
  ) |>
  filter(!is.na(er_key)) |>
  st_drop_geometry() |>
  group_by(worms_id, er_key) |>
  summarise(
    n_occ_hex = n(),
    .groups = "drop"
  ) # 45 rows

# sequentially more restrective filters
# filter:  0 >  1 >  2 >  3 >  4
# n_rows: 75 > 69 > 62 > 55 > 45

d_spp_er_occ <- d_spp_er |>
  select(-sp_cmn) |>
  full_join(
    d_filt0 |>
      rename(n_occ_hex_f0 = n_occ_hex),
    by = c("worms_id", "er_key")
  ) |>
  full_join(
    d_filt1 |>
      rename(n_occ_hex_f1 = n_occ_hex),
    by = c("worms_id", "er_key")
  ) |>
  full_join(
    d_filt2 |>
      rename(n_occ_hex_f2 = n_occ_hex),
    by = c("worms_id", "er_key")
  ) |>
  full_join(
    d_filt3 |>
      rename(n_occ_hex_f3 = n_occ_hex),
    by = c("worms_id", "er_key")
  ) |>
  full_join(
    d_filt4 |>
      rename(n_occ_hex_f4 = n_occ_hex),
    by = c("worms_id", "er_key")
  ) |>
  left_join(
    d_spp15 |>
      select(sp_cmn, worms_id),
    by = "worms_id"
  ) |>
  relocate(sp_cmn, worms_id, er_key) |>
  arrange(sp_cmn, er_key)

datatable(
  d_spp_er_occ,
  caption = "Species counts in Ecoregions for SDM vs occurrenced data"
)

1.8 Compare Ecoregions from SDM vs Occurrence Data

Code
d_spp_er_occ_lgl <- d_spp_er_occ |>
  mutate(
    has_sdm = !is.na(n_sdm_cell),
    has_occ_f0 = !is.na(n_occ_hex_f0),
    has_occ_f1 = !is.na(n_occ_hex_f1),
    has_occ_f2 = !is.na(n_occ_hex_f2),
    has_occ_f3 = !is.na(n_occ_hex_f3),
    has_occ_f4 = !is.na(n_occ_hex_f4)
  ) |>
  select(-starts_with("n_"))

# color code logicals with red (F) and green (T) and replace with icons
d_spp_er_occ_lgl |>
  # mutate(
  #   sp_cmn_id = glue("{sp_cmn} ({worms_id})")
  # ) |>
  # relocate(sp_cmn_id) |>
  # select(-sp_cmn, -worms_id) |>
  select(-worms_id) |>
  datatable(
    caption = "Species counts in Ecoregions from SDM vs Occurrence data as logical values",
  ) |>
  formatStyle(
    columns = c(
      "has_sdm",
      "has_occ_f0",
      "has_occ_f1",
      "has_occ_f2",
      "has_occ_f3",
      "has_occ_f4"
    ),
    target = "cell",
    backgroundColor = styleEqual(
      c(TRUE, FALSE),
      c("lightgreen", "lightcoral")
    ),
    color = styleEqual(
      c(TRUE, FALSE),
      c("darkgreen", "darkred")
    )
  )
Code
# per sp_cmn and filter, tally # ecoregions in sdm vs occ that: match, miss, extra
d_spp_er_smry <- d_spp_er_occ_lgl |>
  pivot_longer(
    cols = starts_with("has_occ_"),
    names_to = "filter",
    names_prefix = "has_occ_",
    values_to = "has_occ"
  ) |>
  filter(
    has_sdm | has_occ
  ) |>
  mutate(
    match_type = case_when(
      has_sdm & has_occ ~ "n_er_equal",
      has_sdm & !has_occ ~ "n_er_miss",
      !has_sdm & has_occ ~ "n_er_add",
      TRUE ~ "none"
    )
  ) |>
  group_by(sp_cmn, filter, match_type) |>
  summarise(
    n_ecoregions = n(),
    .groups = "drop"
  ) |>
  pivot_wider(
    names_from = match_type,
    values_from = n_ecoregions,
    values_fill = 0
  ) |>
  relocate(n_er_equal, n_er_miss, n_er_add, .after = filter) |>
  arrange(sp_cmn, filter)

datatable(
  d_spp_er_smry,
  caption = "Summary of Ecoregion matches between SDM and Occurrence data per species and filter"
)
Code
# determine which filter maximizes n_er_equal while minimizing n_er_miss and n_er_add
d_spp_er_best <- d_spp_er_smry |>
  group_by(sp_cmn) |>
  slice_max(
    order_by = n_er_equal,
    with_ties = T
  ) |>
  slice_min(
    order_by = n_er_miss,
    with_ties = T
  ) |>
  slice_min(
    order_by = n_er_add,
    with_ties = T
  ) |>
  ungroup() |>
  arrange(sp_cmn)

datatable(
  d_spp_er_best,
  caption = "Summary of Best Ecoregion matches between SDM and Occurrence data per species"
)