1 SDM Validation per Ecoregion by Occurrence

1.1 Overview

Get species of interest from the Marine Sensitivity Database and validate ecoregions using GBIF and WoRMS occurence data.

Process:

Get raster for species of interest from Marine Sensitivity Database
Get occurrence data from GBIF and WoRMS for those species
Compare presence of species in Ecoregions for SDM vs occurrence data using different filters
Determine the best filter to systematically apply that maximizes matches between SDM and occurrence data, while minimizing mismatches (additions and subtractions)

TODO:

Buffer 10 km to include Rice’s whale occurrences

1.2 Species of Interest

Walrus, North Atlantic Right Whale, Humpback Whale, Fin Whale, Atlantic Sturgeon, Grey Seal, Atlantic Cod, Common Dolphin, Atlantic Lobster, Bowhead Whale, Black-capped Petrel. Common Tern, Calanus finmarchicus, Meganyctiphanes norvegica, Rice’s Whale

Code

librarian::shelf(
  aws.s3,
  DBI,
  DT,
  dplyr,
  duckdb,
  glue,
  h3jsr,
  here,
  jsonlite,
  mapgl,
  mapview,
  purrr,
  readr,
  rgbif,
  sf,
  stringr,
  terra,
  tidyr,
  worrms,
  quiet = T
)

Warning: package 'mapgl' was built under R version 4.5.2

Code

verbose <- T
is_server <- Sys.info()[["sysname"]] == "Linux"
dir_private <- ifelse(
  is_server,
  "/share/private",
  "~/My Drive/private"
)
dir_data <- ifelse(
  is_server,
  "/share/data",
  "~/My Drive/projects/msens/data"
)
mapbox_tkn_txt <- glue("{dir_private}/mapbox_token_bdbest.txt")
maptile_tkn_txt <- glue("{dir_private}/mapbox_token_bdbest.txt")
cell_tif <- glue("{dir_data}/derived/r_bio-oracle_planarea.tif")
sdm_db <- glue("{dir_data}/derived/sdm.duckdb")
spp15_csv <- here("data/sdm_occ_spp15.csv")
er_gpkg <- glue("{dir_data}/derived/downloads/ply_ecoregions_2025.gpkg")
h3_rds <- here("data/sdm_occ_validation/worms_h3_cells.rds")

# db ----
con_sdm <- dbConnect(duckdb(), dbdir = sdm_db, read_only = T)

# data prep ----
r_cell <- rast(cell_tif)

get_mdl_rast <- function(mdl_seq) {
  # mdl_seq = 3

  d <- tbl(con_sdm, "model_cell") |>
    filter(mdl_seq == !!mdl_seq) |>
    select(cell_id, value) |>
    collect()

  r <- init(r_cell[[1]], NA)
  r[d$cell_id] <- d$value
  names(r) <- "value"

  r
}

get_mdl_ecoregions <- function(mdl_seq) {
  # mdl_seq = 3
  tbl(con_sdm, "model_cell") |>
    filter(
      mdl_seq == !!mdl_seq,
      !is.na(value),
      value > 0
    ) |>
    select(cell_id) |>
    left_join(
      tbl(con_sdm, "zone_cell") |>
        select(cell_id, zone_seq),
      by = join_by(cell_id)
    ) |>
    left_join(
      tbl(con_sdm, "zone") |>
        filter(tbl == "ply_ecoregions_2025") |>
        select(zone_seq, er_key = value),
      by = join_by(zone_seq)
    ) |>
    filter(!is.na(er_key)) |>
    group_by(er_key) |>
    summarise(n_sdm_cell = n()) |>
    collect()
}

Code

if (!file.exists(spp15_csv)) {
  d_spp15 <- tribble(
    ~sp_sci                     , ~sp_cmn                      ,
    "Odobenus rosmarus"         , "Walrus"                     ,
    "Eubalaena glacialis"       , "North Atlantic Right Whale" ,
    "Megaptera novaeangliae"    , "Humpback Whale"             ,
    "Balaenoptera physalus"     , "Fin Whale"                  ,
    "Acipenser oxyrinchus"      , "Atlantic Sturgeon"          ,
    "Halichoerus grypus"        , "Grey Seal"                  ,
    "Gadus morhua"              , "Atlantic Cod"               ,
    "Delphinus delphis"         , "Common Dolphin"             ,
    "Homarus americanus"        , "Atlantic Lobster"           ,
    "Balaena mysticetus"        , "Bowhead Whale"              ,
    "Pterodroma hasitata"       , "Black-capped Petrel"        ,
    "Sterna hirundo"            , "Common Tern"                ,
    "Calanus finmarchicus"      , "Calanus finmarchicus"       ,
    "Meganyctiphanes norvegica" , "Meganyctiphanes norvegica"  ,
    "Balaenoptera ricei"        , "Rice's Whale"
  ) |>
    relocate(sp_cmn) |>
    arrange(sp_cmn) |>
    mutate(
      worms_id = map_int(sp_sci, \(x) {
        worrms::wm_name2id(x) |>
          as.integer()
      }),
      gbif_id = map_int(sp_sci, \(x) {
        rgbif::name_backbone(x) |>
          pull(usageKey) |>
          as.integer()
      }),
      mdl_seq = map_int(sp_sci, \(x) {
        tbl(con_sdm, "taxon") |>
          filter(is_ok) |>
          filter(scientific_name == x) |>
          pull(mdl_seq)
      })
    )
  write_csv(d_spp15, spp15_csv)
} else {
  d_spp15 <- read_csv(spp15_csv, show_col_types = F)
}

d_spp15 |>
  mutate(
    worms_id = glue(
      "<a href='https://www.marinespecies.org/aphia.php?p=taxdetails&id={worms_id}' target='_blank'>{worms_id}</a>"
    ),
    gbif_id = glue(
      "<a href='https://www.gbif.org/species/{gbif_id}' target='_blank'>{gbif_id}</a>"
    ),
    mdl_seq = glue(
      "<a href='https://shiny.marinesensitivity.org/mapsp/?mdl_seq={mdl_seq}' target='_blank'>{mdl_seq}</a>"
    )
  ) |>
  datatable(
    caption = "Species of interest with IDs from WoRMS (used by OBIS), GBIF and the MarineSensitivity models (`mdl_seq`)",
    escape = F
  )

1.3 Get Ecoregions from Marine Sensitivity Database

Code

ply_er <- read_sf(er_gpkg) |>
  rename(er_key = ecoregion_key)

d_spp_er <- d_spp15 |>
  mutate(
    ecoregions = map(mdl_seq, get_mdl_ecoregions)
  ) |>
  select(sp_cmn, worms_id, ecoregions) |>
  unnest(ecoregions) |>
  arrange(sp_cmn, er_key)

1.4 Get Occurrence Data from OBIS / WoRMS

iobis/speciesgrids: gridded datasets of WoRMS aligned marine species distributions as GeoParquet based on the OBIS and GBIF occurrence snapshots. The package currently supports Geohash and H3 grid output.

Code

# Set up duckdb connection and extensions
con <- dbConnect(duckdb())
dbSendQuery(con, "INSTALL httpfs; LOAD httpfs;")

<duckdb_result 3e4e0 connection=74790 statement='INSTALL httpfs; LOAD httpfs;'>

Code

dbSendQuery(con, "INSTALL spatial; LOAD spatial;")

<duckdb_result 945b0 connection=74790 statement='INSTALL spatial; LOAD spatial;'>

Code

dbSendQuery(con, "INSTALL h3 FROM community; LOAD h3;")

<duckdb_result 85dd0 connection=74790 statement='INSTALL h3 FROM community; LOAD h3;'>

Code

# Query
# species <- dbGetQuery(
#   con,
#   glue(
#     "
#   select kingdom, phylum, class, family, genus, species, AphiaID
#   from read_parquet('s3://obis-products/speciesgrids/h3_7/*')
#   where ST_Intersects(geometry, ST_GeomFromText('{wkt}'))
#   group by kingdom, phylum, class, family, genus, species, AphiaID
# "
#   )
# )

# d_op <- get_bucket_df(
#   's3://obis-products',
#   max = Inf
# )

# d_sg <- get_bucket_df(
#   's3://obis-products',
#   prefix = 'speciesgrids/',
#   max = Inf
# )
# which(d_sg$Key == "speedy/distribution_137077.geojson")

# pq <- "s3://obis-products/speciesgrids/h3_7/*"

# column_info <- dbGetQuery(
#   con,
#   glue(
#     "DESCRIBE SELECT * FROM read_parquet('{pq}');"
#   )
# )
# print(column_info)

# get all cells for species with AphiaID 156263
# worms_id <- 137077 # _Odobenus rosmarus_ (walrus)
# Fix the query to use h3_cell_to_boundary_wkt instead of h3_cell_to_boundary_wkb
# d_sp_h3 <- dbGetQuery(
#   con,
#   glue(
#     "select *, h3_cell_to_boundary_wkt(cell) AS geometry_wkt
#     from read_parquet('{pq}')
#     where AphiaID = {aphia_id}"
#   )
# )

for (i in 1:nrow(d_spp15)) {
  # i = 1
  worms_id <- d_spp15$worms_id[i]
  sp_cmn <- d_spp15$sp_cmn[i]

  h3_csv <- here(glue("data/sdm_occ_validation/h3_cells_worms-{worms_id}.csv"))

  if (!file.exists(h3_csv)) {
    message(glue(
      "Querying {i}/15: {sp_cmn} (worms_id: {worms_id}) ~ {Sys.Date()}"
    ))

    d_h3 <- dbGetQuery(
      con,
      glue(
        "SELECT cell AS h3_cell, records, min_year, max_year, source_obis, source_gbif
        from read_parquet('s3://obis-products/speciesgrids/h3_7/*') 
        where AphiaID = {worms_id}"
      )
    )
    write_csv(d_h3, h3_csv)
  }
}

if (!file.exists(h3_rds)) {
  ply_h3 <- tibble(
    csv = list.files(
      here("data/sdm_occ_validation"),
      pattern = "h3_cells_worms-.*\\.csv$",
      full.names = T
    )
  ) |>
    mutate(
      worms_id = str_extract(basename(csv), "(?<=worms-)[0-9]+") |>
        as.integer(),
      data = map(csv, read_csv, show_col_types = F),
    ) |>
    select(-csv) |>
    arrange(worms_id) |>
    unnest(data) |>
    mutate(
      geom = cell_to_polygon(h3_cell, simple = T)
    ) |>
    st_as_sf()

  write_rds(ply_h3, h3_rds)
}

ply_h3 <- read_rds(h3_rds)

1.5 Map of ecoregions with walrus occurrence h3 hexagons

Code

ply_h3_walrus <- ply_h3 |>
  filter(worms_id == 137077) |>
  mutate(
    popup = glue(
      "<strong>H3 Cell:</strong> {h3_cell}<br/>
      <strong>Records:</strong> {records}<br/>
      <strong>Min Year:</strong> {min_year}<br/>
      <strong>Max Year:</strong> {max_year}<br/>
      <strong>Source OBIS:</strong> {source_obis}<br/>
      <strong>Source GBIF:</strong> {source_gbif}"
    )
  )

maplibre(
  style = maptiler_style("bright", variant = "dark"),
  zoom = 3.5,
  center = c(-106, 40.1)
) |>
  add_vector_source(
    id = "er_src",
    url = "https://api.marinesensitivity.org/tilejson?table=public.ply_ecoregions_2025"
  ) |>
  add_line_layer(
    id = "er_ln",
    source = "er_src",
    source_layer = "public.ply_ecoregions_2025",
    line_color = "gray",
    line_opacity = 1,
    line_width = 3
  ) |>
  add_fill_layer(
    id = "walrus_h3",
    source = ply_h3_walrus,
    fill_color = "white",
    fill_opacity = 0.5,
    popup = "popup"
  ) |>
  add_fullscreen_control() |>
  add_navigation_control() |>
  add_scale_control() |>
  add_geocoder_control()

1.6 Map of ecoregions with Rice’s whale occurrence h3 hexagons

Code

ply_h3_ricei <- ply_h3 |>
  filter(worms_id == 1576133) |>
  mutate(
    popup = glue(
      "<strong>H3 Cell:</strong> {h3_cell}<br/>
      <strong>Records:</strong> {records}<br/>
      <strong>Min Year:</strong> {min_year}<br/>
      <strong>Max Year:</strong> {max_year}<br/>
      <strong>Source OBIS:</strong> {source_obis}<br/>
      <strong>Source GBIF:</strong> {source_gbif}"
    )
  )

maplibre(
  style = maptiler_style("bright", variant = "dark"),
  zoom = 3.5,
  center = c(-106, 40.1)
) |>
  add_vector_source(
    id = "er_src",
    url = "https://api.marinesensitivity.org/tilejson?table=public.ply_ecoregions_2025"
  ) |>
  add_line_layer(
    id = "er_ln",
    source = "er_src",
    source_layer = "public.ply_ecoregions_2025",
    line_color = "gray",
    line_opacity = 1,
    line_width = 3
  ) |>
  add_fill_layer(
    id = "ricei_h3",
    source = ply_h3_ricei,
    fill_color = "white",
    fill_opacity = 0.5,
    popup = "popup"
  ) |>
  add_fullscreen_control() |>
  add_navigation_control() |>
  add_scale_control() |>
  add_geocoder_control()

1.7 Apply Filters to Occurrence Data

Code

# apply
d_filt0 <- ply_h3 |>
  # unfiltered, regardless of source (obis/gbif) or year (even has min_year)
  select(worms_id, h3_cell) |>
  st_join(
    ply_er |>
      select(er_key)
  ) |>
  filter(!is.na(er_key)) |>
  st_drop_geometry() |>
  group_by(worms_id, er_key) |>
  summarise(
    n_occ_hex = n(),
    .groups = "drop"
  ) # 75 rows

d_filt1 <- ply_h3 |>
  # source_obis
  filter(
    source_obis == T
  ) |>
  select(worms_id, h3_cell) |>
  st_join(
    ply_er |>
      select(er_key)
  ) |>
  filter(!is.na(er_key)) |>
  st_drop_geometry() |>
  group_by(worms_id, er_key) |>
  summarise(
    n_occ_hex = n(),
    .groups = "drop"
  ) # 69 rows

d_filt2 <- ply_h3 |>
  # source_obis, has min_year data
  filter(
    source_obis == T,
    !is.na(min_year)
  ) |>
  select(worms_id, h3_cell) |>
  st_join(
    ply_er |>
      select(er_key)
  ) |>
  filter(!is.na(er_key)) |>
  st_drop_geometry() |>
  group_by(worms_id, er_key) |>
  summarise(
    n_occ_hex = n(),
    .groups = "drop"
  ) # 62 rows

d_filt3 <- ply_h3 |>
  # source_obis, min_year >= 1975 (last 50 years)
  filter(
    source_obis == T,
    !is.na(min_year),
    min_year >= 1975
  ) |>
  select(worms_id, h3_cell) |>
  st_join(
    ply_er |>
      select(er_key)
  ) |>
  filter(!is.na(er_key)) |>
  st_drop_geometry() |>
  group_by(worms_id, er_key) |>
  summarise(
    n_occ_hex = n(),
    .groups = "drop"
  ) # 55 rows

d_filt4 <- ply_h3 |>
  # source_obis, min_year >= 2000 (last 25 years)
  filter(
    source_obis == T,
    !is.na(min_year),
    min_year >= 2000
  ) |>
  select(worms_id, h3_cell) |>
  st_join(
    ply_er |>
      select(er_key)
  ) |>
  filter(!is.na(er_key)) |>
  st_drop_geometry() |>
  group_by(worms_id, er_key) |>
  summarise(
    n_occ_hex = n(),
    .groups = "drop"
  ) # 45 rows

# sequentially more restrective filters
# filter:  0 >  1 >  2 >  3 >  4
# n_rows: 75 > 69 > 62 > 55 > 45

d_spp_er_occ <- d_spp_er |>
  select(-sp_cmn) |>
  full_join(
    d_filt0 |>
      rename(n_occ_hex_f0 = n_occ_hex),
    by = c("worms_id", "er_key")
  ) |>
  full_join(
    d_filt1 |>
      rename(n_occ_hex_f1 = n_occ_hex),
    by = c("worms_id", "er_key")
  ) |>
  full_join(
    d_filt2 |>
      rename(n_occ_hex_f2 = n_occ_hex),
    by = c("worms_id", "er_key")
  ) |>
  full_join(
    d_filt3 |>
      rename(n_occ_hex_f3 = n_occ_hex),
    by = c("worms_id", "er_key")
  ) |>
  full_join(
    d_filt4 |>
      rename(n_occ_hex_f4 = n_occ_hex),
    by = c("worms_id", "er_key")
  ) |>
  left_join(
    d_spp15 |>
      select(sp_cmn, worms_id),
    by = "worms_id"
  ) |>
  relocate(sp_cmn, worms_id, er_key) |>
  arrange(sp_cmn, er_key)

datatable(
  d_spp_er_occ,
  caption = "Species counts in Ecoregions for SDM vs occurrenced data"
)

1.8 Compare Ecoregions from SDM vs Occurrence Data

Code

d_spp_er_occ_lgl <- d_spp_er_occ |>
  mutate(
    has_sdm = !is.na(n_sdm_cell),
    has_occ_f0 = !is.na(n_occ_hex_f0),
    has_occ_f1 = !is.na(n_occ_hex_f1),
    has_occ_f2 = !is.na(n_occ_hex_f2),
    has_occ_f3 = !is.na(n_occ_hex_f3),
    has_occ_f4 = !is.na(n_occ_hex_f4)
  ) |>
  select(-starts_with("n_"))

# color code logicals with red (F) and green (T) and replace with icons
d_spp_er_occ_lgl |>
  # mutate(
  #   sp_cmn_id = glue("{sp_cmn} ({worms_id})")
  # ) |>
  # relocate(sp_cmn_id) |>
  # select(-sp_cmn, -worms_id) |>
  select(-worms_id) |>
  datatable(
    caption = "Species counts in Ecoregions from SDM vs Occurrence data as logical values",
  ) |>
  formatStyle(
    columns = c(
      "has_sdm",
      "has_occ_f0",
      "has_occ_f1",
      "has_occ_f2",
      "has_occ_f3",
      "has_occ_f4"
    ),
    target = "cell",
    backgroundColor = styleEqual(
      c(TRUE, FALSE),
      c("lightgreen", "lightcoral")
    ),
    color = styleEqual(
      c(TRUE, FALSE),
      c("darkgreen", "darkred")
    )
  )

Code

# per sp_cmn and filter, tally # ecoregions in sdm vs occ that: match, miss, extra
d_spp_er_smry <- d_spp_er_occ_lgl |>
  pivot_longer(
    cols = starts_with("has_occ_"),
    names_to = "filter",
    names_prefix = "has_occ_",
    values_to = "has_occ"
  ) |>
  filter(
    has_sdm | has_occ
  ) |>
  mutate(
    match_type = case_when(
      has_sdm & has_occ ~ "n_er_equal",
      has_sdm & !has_occ ~ "n_er_miss",
      !has_sdm & has_occ ~ "n_er_add",
      TRUE ~ "none"
    )
  ) |>
  group_by(sp_cmn, filter, match_type) |>
  summarise(
    n_ecoregions = n(),
    .groups = "drop"
  ) |>
  pivot_wider(
    names_from = match_type,
    values_from = n_ecoregions,
    values_fill = 0
  ) |>
  relocate(n_er_equal, n_er_miss, n_er_add, .after = filter) |>
  arrange(sp_cmn, filter)

datatable(
  d_spp_er_smry,
  caption = "Summary of Ecoregion matches between SDM and Occurrence data per species and filter"
)

Code

# determine which filter maximizes n_er_equal while minimizing n_er_miss and n_er_add
d_spp_er_best <- d_spp_er_smry |>
  group_by(sp_cmn) |>
  slice_max(
    order_by = n_er_equal,
    with_ties = T
  ) |>
  slice_min(
    order_by = n_er_miss,
    with_ties = T
  ) |>
  slice_min(
    order_by = n_er_add,
    with_ties = T
  ) |>
  ungroup() |>
  arrange(sp_cmn)

datatable(
  d_spp_er_best,
  caption = "Summary of Best Ecoregion matches between SDM and Occurrence data per species"
)

# SDM Validation per Ecoregion by Occurrence ## Overview Get species of interest from the Marine Sensitivity Database and validate ecoregions using GBIF and WoRMS occurence data. Process: - Get raster for species of interest from Marine Sensitivity Database - Get occurrence data from GBIF and WoRMS for those species - Compare presence of species in Ecoregions for SDM vs occurrence data using different filters - Determine the best filter to systematically apply that maximizes matches between SDM and occurrence data, while minimizing mismatches (additions and subtractions) TODO: - Buffer 10 km to include Rice's whale occurrences ## Species of Interest Walrus, North Atlantic Right Whale, Humpback Whale, Fin Whale, Atlantic Sturgeon, Grey Seal, Atlantic Cod, Common Dolphin, Atlantic Lobster, Bowhead Whale, Black-capped Petrel. Common Tern, Calanus finmarchicus, Meganyctiphanes norvegica, Rice's Whale ```{r} #| label: setup librarian::shelf( aws.s3, DBI, DT, dplyr, duckdb, glue, h3jsr, here, jsonlite, mapgl, mapview, purrr, readr, rgbif, sf, stringr, terra, tidyr, worrms, quiet = T ) verbose <- T is_server <- Sys.info()[["sysname"]] == "Linux" dir_private <- ifelse( is_server, "/share/private", "~/My Drive/private" ) dir_data <- ifelse( is_server, "/share/data", "~/My Drive/projects/msens/data" ) mapbox_tkn_txt <- glue("{dir_private}/mapbox_token_bdbest.txt") maptile_tkn_txt <- glue("{dir_private}/mapbox_token_bdbest.txt") cell_tif <- glue("{dir_data}/derived/r_bio-oracle_planarea.tif") sdm_db <- glue("{dir_data}/derived/sdm.duckdb") spp15_csv <- here("data/sdm_occ_spp15.csv") er_gpkg <- glue("{dir_data}/derived/downloads/ply_ecoregions_2025.gpkg") h3_rds <- here("data/sdm_occ_validation/worms_h3_cells.rds") # db ---- con_sdm <- dbConnect(duckdb(), dbdir = sdm_db, read_only = T) # data prep ---- r_cell <- rast(cell_tif) get_mdl_rast <- function(mdl_seq) { # mdl_seq = 3 d <- tbl(con_sdm, "model_cell") |> filter(mdl_seq == !!mdl_seq) |> select(cell_id, value) |> collect() r <- init(r_cell[[1]], NA) r[d$cell_id] <- d$value names(r) <- "value" r } get_mdl_ecoregions <- function(mdl_seq) { # mdl_seq = 3 tbl(con_sdm, "model_cell") |> filter( mdl_seq == !!mdl_seq, !is.na(value), value > 0 ) |> select(cell_id) |> left_join( tbl(con_sdm, "zone_cell") |> select(cell_id, zone_seq), by = join_by(cell_id) ) |> left_join( tbl(con_sdm, "zone") |> filter(tbl == "ply_ecoregions_2025") |> select(zone_seq, er_key = value), by = join_by(zone_seq) ) |> filter(!is.na(er_key)) |> group_by(er_key) |> summarise(n_sdm_cell = n()) |> collect() } ``` ```{r} #| label: d_spp if (!file.exists(spp15_csv)) { d_spp15 <- tribble( ~sp_sci , ~sp_cmn , "Odobenus rosmarus" , "Walrus" , "Eubalaena glacialis" , "North Atlantic Right Whale" , "Megaptera novaeangliae" , "Humpback Whale" , "Balaenoptera physalus" , "Fin Whale" , "Acipenser oxyrinchus" , "Atlantic Sturgeon" , "Halichoerus grypus" , "Grey Seal" , "Gadus morhua" , "Atlantic Cod" , "Delphinus delphis" , "Common Dolphin" , "Homarus americanus" , "Atlantic Lobster" , "Balaena mysticetus" , "Bowhead Whale" , "Pterodroma hasitata" , "Black-capped Petrel" , "Sterna hirundo" , "Common Tern" , "Calanus finmarchicus" , "Calanus finmarchicus" , "Meganyctiphanes norvegica" , "Meganyctiphanes norvegica" , "Balaenoptera ricei" , "Rice's Whale" ) |> relocate(sp_cmn) |> arrange(sp_cmn) |> mutate( worms_id = map_int(sp_sci, \(x) { worrms::wm_name2id(x) |> as.integer() }), gbif_id = map_int(sp_sci, \(x) { rgbif::name_backbone(x) |> pull(usageKey) |> as.integer() }), mdl_seq = map_int(sp_sci, \(x) { tbl(con_sdm, "taxon") |> filter(is_ok) |> filter(scientific_name == x) |> pull(mdl_seq) }) ) write_csv(d_spp15, spp15_csv) } else { d_spp15 <- read_csv(spp15_csv, show_col_types = F) } d_spp15 |> mutate( worms_id = glue( "<a href='https://www.marinespecies.org/aphia.php?p=taxdetails&id={worms_id}' target='_blank'>{worms_id}</a>" ), gbif_id = glue( "<a href='https://www.gbif.org/species/{gbif_id}' target='_blank'>{gbif_id}</a>" ), mdl_seq = glue( "<a href='https://shiny.marinesensitivity.org/mapsp/?mdl_seq={mdl_seq}' target='_blank'>{mdl_seq}</a>" ) ) |> datatable( caption = "Species of interest with IDs from WoRMS (used by OBIS), GBIF and the MarineSensitivity models (`mdl_seq`)", escape = F ) ``` ## Get Ecoregions from Marine Sensitivity Database ```{r} #| label: ecoregions ply_er <- read_sf(er_gpkg) |> rename(er_key = ecoregion_key) d_spp_er <- d_spp15 |> mutate( ecoregions = map(mdl_seq, get_mdl_ecoregions) ) |> select(sp_cmn, worms_id, ecoregions) |> unnest(ecoregions) |> arrange(sp_cmn, er_key) ``` ## Get Occurrence Data from OBIS / WoRMS - [iobis/speciesgrids](https://github.com/iobis/speciesgrids): gridded datasets of WoRMS aligned marine species distributions as GeoParquet based on the OBIS and GBIF occurrence snapshots. The package currently supports Geohash and H3 grid output. ```{r} #| label: other # Set up duckdb connection and extensions con <- dbConnect(duckdb()) dbSendQuery(con, "INSTALL httpfs; LOAD httpfs;") dbSendQuery(con, "INSTALL spatial; LOAD spatial;") dbSendQuery(con, "INSTALL h3 FROM community; LOAD h3;") # Query # species <- dbGetQuery( # con, # glue( # " # select kingdom, phylum, class, family, genus, species, AphiaID # from read_parquet('s3://obis-products/speciesgrids/h3_7/*') # where ST_Intersects(geometry, ST_GeomFromText('{wkt}')) # group by kingdom, phylum, class, family, genus, species, AphiaID # " # ) # ) # d_op <- get_bucket_df( # 's3://obis-products', # max = Inf # ) # d_sg <- get_bucket_df( # 's3://obis-products', # prefix = 'speciesgrids/', # max = Inf # ) # which(d_sg$Key == "speedy/distribution_137077.geojson") # pq <- "s3://obis-products/speciesgrids/h3_7/*" # column_info <- dbGetQuery( # con, # glue( # "DESCRIBE SELECT * FROM read_parquet('{pq}');" # ) # ) # print(column_info) # get all cells for species with AphiaID 156263 # worms_id <- 137077 # _Odobenus rosmarus_ (walrus) # Fix the query to use h3_cell_to_boundary_wkt instead of h3_cell_to_boundary_wkb # d_sp_h3 <- dbGetQuery( # con, # glue( # "select *, h3_cell_to_boundary_wkt(cell) AS geometry_wkt # from read_parquet('{pq}') # where AphiaID = {aphia_id}" # ) # ) for (i in 1:nrow(d_spp15)) { # i = 1 worms_id <- d_spp15$worms_id[i] sp_cmn <- d_spp15$sp_cmn[i] h3_csv <- here(glue("data/sdm_occ_validation/h3_cells_worms-{worms_id}.csv")) if (!file.exists(h3_csv)) { message(glue( "Querying {i}/15: {sp_cmn} (worms_id: {worms_id}) ~ {Sys.Date()}" )) d_h3 <- dbGetQuery( con, glue( "SELECT cell AS h3_cell, records, min_year, max_year, source_obis, source_gbif from read_parquet('s3://obis-products/speciesgrids/h3_7/*') where AphiaID = {worms_id}" ) ) write_csv(d_h3, h3_csv) } } if (!file.exists(h3_rds)) { ply_h3 <- tibble( csv = list.files( here("data/sdm_occ_validation"), pattern = "h3_cells_worms-.*\\.csv$", full.names = T ) ) |> mutate( worms_id = str_extract(basename(csv), "(?<=worms-)[0-9]+") |> as.integer(), data = map(csv, read_csv, show_col_types = F), ) |> select(-csv) |> arrange(worms_id) |> unnest(data) |> mutate( geom = cell_to_polygon(h3_cell, simple = T) ) |> st_as_sf() write_rds(ply_h3, h3_rds) } ply_h3 <- read_rds(h3_rds) ``` ## Map of ecoregions with **walrus** occurrence h3 hexagons ```{r} #| label: map_walrus ply_h3_walrus <- ply_h3 |> filter(worms_id == 137077) |> mutate( popup = glue( "H3 Cell: {h3_cell} Records: {records} Min Year: {min_year} Max Year: {max_year} Source OBIS: {source_obis} Source GBIF: {source_gbif}" ) ) maplibre( style = maptiler_style("bright", variant = "dark"), zoom = 3.5, center = c(-106, 40.1) ) |> add_vector_source( id = "er_src", url = "https://api.marinesensitivity.org/tilejson?table=public.ply_ecoregions_2025" ) |> add_line_layer( id = "er_ln", source = "er_src", source_layer = "public.ply_ecoregions_2025", line_color = "gray", line_opacity = 1, line_width = 3 ) |> add_fill_layer( id = "walrus_h3", source = ply_h3_walrus, fill_color = "white", fill_opacity = 0.5, popup = "popup" ) |> add_fullscreen_control() |> add_navigation_control() |> add_scale_control() |> add_geocoder_control() ``` ## Map of ecoregions with **Rice's whale** occurrence h3 hexagons ```{r} #| label: map_ricei ply_h3_ricei <- ply_h3 |> filter(worms_id == 1576133) |> mutate( popup = glue( "H3 Cell: {h3_cell} Records: {records} Min Year: {min_year} Max Year: {max_year} Source OBIS: {source_obis} Source GBIF: {source_gbif}" ) ) maplibre( style = maptiler_style("bright", variant = "dark"), zoom = 3.5, center = c(-106, 40.1) ) |> add_vector_source( id = "er_src", url = "https://api.marinesensitivity.org/tilejson?table=public.ply_ecoregions_2025" ) |> add_line_layer( id = "er_ln", source = "er_src", source_layer = "public.ply_ecoregions_2025", line_color = "gray", line_opacity = 1, line_width = 3 ) |> add_fill_layer( id = "ricei_h3", source = ply_h3_ricei, fill_color = "white", fill_opacity = 0.5, popup = "popup" ) |> add_fullscreen_control() |> add_navigation_control() |> add_scale_control() |> add_geocoder_control() ``` ## Apply Filters to Occurrence Data ```{r} #| label: filters to d_spp_er_occ # apply d_filt0 <- ply_h3 |> # unfiltered, regardless of source (obis/gbif) or year (even has min_year) select(worms_id, h3_cell) |> st_join( ply_er |> select(er_key) ) |> filter(!is.na(er_key)) |> st_drop_geometry() |> group_by(worms_id, er_key) |> summarise( n_occ_hex = n(), .groups = "drop" ) # 75 rows d_filt1 <- ply_h3 |> # source_obis filter( source_obis == T ) |> select(worms_id, h3_cell) |> st_join( ply_er |> select(er_key) ) |> filter(!is.na(er_key)) |> st_drop_geometry() |> group_by(worms_id, er_key) |> summarise( n_occ_hex = n(), .groups = "drop" ) # 69 rows d_filt2 <- ply_h3 |> # source_obis, has min_year data filter( source_obis == T, !is.na(min_year) ) |> select(worms_id, h3_cell) |> st_join( ply_er |> select(er_key) ) |> filter(!is.na(er_key)) |> st_drop_geometry() |> group_by(worms_id, er_key) |> summarise( n_occ_hex = n(), .groups = "drop" ) # 62 rows d_filt3 <- ply_h3 |> # source_obis, min_year >= 1975 (last 50 years) filter( source_obis == T, !is.na(min_year), min_year >= 1975 ) |> select(worms_id, h3_cell) |> st_join( ply_er |> select(er_key) ) |> filter(!is.na(er_key)) |> st_drop_geometry() |> group_by(worms_id, er_key) |> summarise( n_occ_hex = n(), .groups = "drop" ) # 55 rows d_filt4 <- ply_h3 |> # source_obis, min_year >= 2000 (last 25 years) filter( source_obis == T, !is.na(min_year), min_year >= 2000 ) |> select(worms_id, h3_cell) |> st_join( ply_er |> select(er_key) ) |> filter(!is.na(er_key)) |> st_drop_geometry() |> group_by(worms_id, er_key) |> summarise( n_occ_hex = n(), .groups = "drop" ) # 45 rows # sequentially more restrective filters # filter: 0 > 1 > 2 > 3 > 4 # n_rows: 75 > 69 > 62 > 55 > 45 d_spp_er_occ <- d_spp_er |> select(-sp_cmn) |> full_join( d_filt0 |> rename(n_occ_hex_f0 = n_occ_hex), by = c("worms_id", "er_key") ) |> full_join( d_filt1 |> rename(n_occ_hex_f1 = n_occ_hex), by = c("worms_id", "er_key") ) |> full_join( d_filt2 |> rename(n_occ_hex_f2 = n_occ_hex), by = c("worms_id", "er_key") ) |> full_join( d_filt3 |> rename(n_occ_hex_f3 = n_occ_hex), by = c("worms_id", "er_key") ) |> full_join( d_filt4 |> rename(n_occ_hex_f4 = n_occ_hex), by = c("worms_id", "er_key") ) |> left_join( d_spp15 |> select(sp_cmn, worms_id), by = "worms_id" ) |> relocate(sp_cmn, worms_id, er_key) |> arrange(sp_cmn, er_key) datatable( d_spp_er_occ, caption = "Species counts in Ecoregions for SDM vs occurrenced data" ) ``` ## Compare Ecoregions from SDM vs Occurrence Data ```{r} #| label: d_spp_er_occ_lgl to d_spp_er_best d_spp_er_occ_lgl <- d_spp_er_occ |> mutate( has_sdm = !is.na(n_sdm_cell), has_occ_f0 = !is.na(n_occ_hex_f0), has_occ_f1 = !is.na(n_occ_hex_f1), has_occ_f2 = !is.na(n_occ_hex_f2), has_occ_f3 = !is.na(n_occ_hex_f3), has_occ_f4 = !is.na(n_occ_hex_f4) ) |> select(-starts_with("n_")) # color code logicals with red (F) and green (T) and replace with icons d_spp_er_occ_lgl |> # mutate( # sp_cmn_id = glue("{sp_cmn} ({worms_id})") # ) |> # relocate(sp_cmn_id) |> # select(-sp_cmn, -worms_id) |> select(-worms_id) |> datatable( caption = "Species counts in Ecoregions from SDM vs Occurrence data as logical values", ) |> formatStyle( columns = c( "has_sdm", "has_occ_f0", "has_occ_f1", "has_occ_f2", "has_occ_f3", "has_occ_f4" ), target = "cell", backgroundColor = styleEqual( c(TRUE, FALSE), c("lightgreen", "lightcoral") ), color = styleEqual( c(TRUE, FALSE), c("darkgreen", "darkred") ) ) # per sp_cmn and filter, tally # ecoregions in sdm vs occ that: match, miss, extra d_spp_er_smry <- d_spp_er_occ_lgl |> pivot_longer( cols = starts_with("has_occ_"), names_to = "filter", names_prefix = "has_occ_", values_to = "has_occ" ) |> filter( has_sdm | has_occ ) |> mutate( match_type = case_when( has_sdm & has_occ ~ "n_er_equal", has_sdm & !has_occ ~ "n_er_miss", !has_sdm & has_occ ~ "n_er_add", TRUE ~ "none" ) ) |> group_by(sp_cmn, filter, match_type) |> summarise( n_ecoregions = n(), .groups = "drop" ) |> pivot_wider( names_from = match_type, values_from = n_ecoregions, values_fill = 0 ) |> relocate(n_er_equal, n_er_miss, n_er_add, .after = filter) |> arrange(sp_cmn, filter) datatable( d_spp_er_smry, caption = "Summary of Ecoregion matches between SDM and Occurrence data per species and filter" ) # determine which filter maximizes n_er_equal while minimizing n_er_miss and n_er_add d_spp_er_best <- d_spp_er_smry |> group_by(sp_cmn) |> slice_max( order_by = n_er_equal, with_ties = T ) |> slice_min( order_by = n_er_miss, with_ties = T ) |> slice_min( order_by = n_er_add, with_ties = T ) |> ungroup() |> arrange(sp_cmn) datatable( d_spp_er_best, caption = "Summary of Best Ecoregion matches between SDM and Occurrence data per species" ) ```