# SDM Validation per Ecoregion by Occurrence
## Overview
Get species of interest from the Marine Sensitivity Database and validate ecoregions using GBIF and WoRMS occurence data.
Process:
- Get raster for species of interest from Marine Sensitivity Database
- Get occurrence data from GBIF and WoRMS for those species
- Compare presence of species in Ecoregions for SDM vs occurrence data using different filters
- Determine the best filter to systematically apply that maximizes matches between SDM and occurrence data, while minimizing mismatches (additions and subtractions)
TODO:
- Buffer 10 km to include Rice's whale occurrences
## Species of Interest
Walrus, North Atlantic Right Whale, Humpback Whale, Fin Whale, Atlantic Sturgeon, Grey Seal, Atlantic Cod, Common Dolphin, Atlantic Lobster, Bowhead Whale, Black-capped Petrel. Common Tern, Calanus finmarchicus, Meganyctiphanes norvegica, Rice's Whale
```{r}
#| label: setup
librarian::shelf(
aws.s3,
DBI,
DT,
dplyr,
duckdb,
glue,
h3jsr,
here,
jsonlite,
mapgl,
mapview,
purrr,
readr,
rgbif,
sf,
stringr,
terra,
tidyr,
worrms,
quiet = T
)
verbose <- T
is_server <- Sys.info()[["sysname"]] == "Linux"
dir_private <- ifelse(
is_server,
"/share/private",
"~/My Drive/private"
)
dir_data <- ifelse(
is_server,
"/share/data",
"~/My Drive/projects/msens/data"
)
mapbox_tkn_txt <- glue("{dir_private}/mapbox_token_bdbest.txt")
maptile_tkn_txt <- glue("{dir_private}/mapbox_token_bdbest.txt")
cell_tif <- glue("{dir_data}/derived/r_bio-oracle_planarea.tif")
sdm_db <- glue("{dir_data}/derived/sdm.duckdb")
spp15_csv <- here("data/sdm_occ_spp15.csv")
er_gpkg <- glue("{dir_data}/derived/downloads/ply_ecoregions_2025.gpkg")
h3_rds <- here("data/sdm_occ_validation/worms_h3_cells.rds")
# db ----
con_sdm <- dbConnect(duckdb(), dbdir = sdm_db, read_only = T)
# data prep ----
r_cell <- rast(cell_tif)
get_mdl_rast <- function(mdl_seq) {
# mdl_seq = 3
d <- tbl(con_sdm, "model_cell") |>
filter(mdl_seq == !!mdl_seq) |>
select(cell_id, value) |>
collect()
r <- init(r_cell[[1]], NA)
r[d$cell_id] <- d$value
names(r) <- "value"
r
}
get_mdl_ecoregions <- function(mdl_seq) {
# mdl_seq = 3
tbl(con_sdm, "model_cell") |>
filter(
mdl_seq == !!mdl_seq,
!is.na(value),
value > 0
) |>
select(cell_id) |>
left_join(
tbl(con_sdm, "zone_cell") |>
select(cell_id, zone_seq),
by = join_by(cell_id)
) |>
left_join(
tbl(con_sdm, "zone") |>
filter(tbl == "ply_ecoregions_2025") |>
select(zone_seq, er_key = value),
by = join_by(zone_seq)
) |>
filter(!is.na(er_key)) |>
group_by(er_key) |>
summarise(n_sdm_cell = n()) |>
collect()
}
```
```{r}
#| label: d_spp
if (!file.exists(spp15_csv)) {
d_spp15 <- tribble(
~sp_sci , ~sp_cmn ,
"Odobenus rosmarus" , "Walrus" ,
"Eubalaena glacialis" , "North Atlantic Right Whale" ,
"Megaptera novaeangliae" , "Humpback Whale" ,
"Balaenoptera physalus" , "Fin Whale" ,
"Acipenser oxyrinchus" , "Atlantic Sturgeon" ,
"Halichoerus grypus" , "Grey Seal" ,
"Gadus morhua" , "Atlantic Cod" ,
"Delphinus delphis" , "Common Dolphin" ,
"Homarus americanus" , "Atlantic Lobster" ,
"Balaena mysticetus" , "Bowhead Whale" ,
"Pterodroma hasitata" , "Black-capped Petrel" ,
"Sterna hirundo" , "Common Tern" ,
"Calanus finmarchicus" , "Calanus finmarchicus" ,
"Meganyctiphanes norvegica" , "Meganyctiphanes norvegica" ,
"Balaenoptera ricei" , "Rice's Whale"
) |>
relocate(sp_cmn) |>
arrange(sp_cmn) |>
mutate(
worms_id = map_int(sp_sci, \(x) {
worrms::wm_name2id(x) |>
as.integer()
}),
gbif_id = map_int(sp_sci, \(x) {
rgbif::name_backbone(x) |>
pull(usageKey) |>
as.integer()
}),
mdl_seq = map_int(sp_sci, \(x) {
tbl(con_sdm, "taxon") |>
filter(is_ok) |>
filter(scientific_name == x) |>
pull(mdl_seq)
})
)
write_csv(d_spp15, spp15_csv)
} else {
d_spp15 <- read_csv(spp15_csv, show_col_types = F)
}
d_spp15 |>
mutate(
worms_id = glue(
"<a href='https://www.marinespecies.org/aphia.php?p=taxdetails&id={worms_id}' target='_blank'>{worms_id}</a>"
),
gbif_id = glue(
"<a href='https://www.gbif.org/species/{gbif_id}' target='_blank'>{gbif_id}</a>"
),
mdl_seq = glue(
"<a href='https://shiny.marinesensitivity.org/mapsp/?mdl_seq={mdl_seq}' target='_blank'>{mdl_seq}</a>"
)
) |>
datatable(
caption = "Species of interest with IDs from WoRMS (used by OBIS), GBIF and the MarineSensitivity models (`mdl_seq`)",
escape = F
)
```
## Get Ecoregions from Marine Sensitivity Database
```{r}
#| label: ecoregions
ply_er <- read_sf(er_gpkg) |>
rename(er_key = ecoregion_key)
d_spp_er <- d_spp15 |>
mutate(
ecoregions = map(mdl_seq, get_mdl_ecoregions)
) |>
select(sp_cmn, worms_id, ecoregions) |>
unnest(ecoregions) |>
arrange(sp_cmn, er_key)
```
## Get Occurrence Data from OBIS / WoRMS
- [iobis/speciesgrids](https://github.com/iobis/speciesgrids): gridded datasets of WoRMS aligned marine species distributions as GeoParquet based on the OBIS and GBIF occurrence snapshots. The package currently supports Geohash and H3 grid output.
```{r}
#| label: other
# Set up duckdb connection and extensions
con <- dbConnect(duckdb())
dbSendQuery(con, "INSTALL httpfs; LOAD httpfs;")
dbSendQuery(con, "INSTALL spatial; LOAD spatial;")
dbSendQuery(con, "INSTALL h3 FROM community; LOAD h3;")
# Query
# species <- dbGetQuery(
# con,
# glue(
# "
# select kingdom, phylum, class, family, genus, species, AphiaID
# from read_parquet('s3://obis-products/speciesgrids/h3_7/*')
# where ST_Intersects(geometry, ST_GeomFromText('{wkt}'))
# group by kingdom, phylum, class, family, genus, species, AphiaID
# "
# )
# )
# d_op <- get_bucket_df(
# 's3://obis-products',
# max = Inf
# )
# d_sg <- get_bucket_df(
# 's3://obis-products',
# prefix = 'speciesgrids/',
# max = Inf
# )
# which(d_sg$Key == "speedy/distribution_137077.geojson")
# pq <- "s3://obis-products/speciesgrids/h3_7/*"
# column_info <- dbGetQuery(
# con,
# glue(
# "DESCRIBE SELECT * FROM read_parquet('{pq}');"
# )
# )
# print(column_info)
# get all cells for species with AphiaID 156263
# worms_id <- 137077 # _Odobenus rosmarus_ (walrus)
# Fix the query to use h3_cell_to_boundary_wkt instead of h3_cell_to_boundary_wkb
# d_sp_h3 <- dbGetQuery(
# con,
# glue(
# "select *, h3_cell_to_boundary_wkt(cell) AS geometry_wkt
# from read_parquet('{pq}')
# where AphiaID = {aphia_id}"
# )
# )
for (i in 1:nrow(d_spp15)) {
# i = 1
worms_id <- d_spp15$worms_id[i]
sp_cmn <- d_spp15$sp_cmn[i]
h3_csv <- here(glue("data/sdm_occ_validation/h3_cells_worms-{worms_id}.csv"))
if (!file.exists(h3_csv)) {
message(glue(
"Querying {i}/15: {sp_cmn} (worms_id: {worms_id}) ~ {Sys.Date()}"
))
d_h3 <- dbGetQuery(
con,
glue(
"SELECT cell AS h3_cell, records, min_year, max_year, source_obis, source_gbif
from read_parquet('s3://obis-products/speciesgrids/h3_7/*')
where AphiaID = {worms_id}"
)
)
write_csv(d_h3, h3_csv)
}
}
if (!file.exists(h3_rds)) {
ply_h3 <- tibble(
csv = list.files(
here("data/sdm_occ_validation"),
pattern = "h3_cells_worms-.*\\.csv$",
full.names = T
)
) |>
mutate(
worms_id = str_extract(basename(csv), "(?<=worms-)[0-9]+") |>
as.integer(),
data = map(csv, read_csv, show_col_types = F),
) |>
select(-csv) |>
arrange(worms_id) |>
unnest(data) |>
mutate(
geom = cell_to_polygon(h3_cell, simple = T)
) |>
st_as_sf()
write_rds(ply_h3, h3_rds)
}
ply_h3 <- read_rds(h3_rds)
```
## Map of ecoregions with **walrus** occurrence h3 hexagons
```{r}
#| label: map_walrus
ply_h3_walrus <- ply_h3 |>
filter(worms_id == 137077) |>
mutate(
popup = glue(
"<strong>H3 Cell:</strong> {h3_cell}<br/>
<strong>Records:</strong> {records}<br/>
<strong>Min Year:</strong> {min_year}<br/>
<strong>Max Year:</strong> {max_year}<br/>
<strong>Source OBIS:</strong> {source_obis}<br/>
<strong>Source GBIF:</strong> {source_gbif}"
)
)
maplibre(
style = maptiler_style("bright", variant = "dark"),
zoom = 3.5,
center = c(-106, 40.1)
) |>
add_vector_source(
id = "er_src",
url = "https://api.marinesensitivity.org/tilejson?table=public.ply_ecoregions_2025"
) |>
add_line_layer(
id = "er_ln",
source = "er_src",
source_layer = "public.ply_ecoregions_2025",
line_color = "gray",
line_opacity = 1,
line_width = 3
) |>
add_fill_layer(
id = "walrus_h3",
source = ply_h3_walrus,
fill_color = "white",
fill_opacity = 0.5,
popup = "popup"
) |>
add_fullscreen_control() |>
add_navigation_control() |>
add_scale_control() |>
add_geocoder_control()
```
## Map of ecoregions with **Rice's whale** occurrence h3 hexagons
```{r}
#| label: map_ricei
ply_h3_ricei <- ply_h3 |>
filter(worms_id == 1576133) |>
mutate(
popup = glue(
"<strong>H3 Cell:</strong> {h3_cell}<br/>
<strong>Records:</strong> {records}<br/>
<strong>Min Year:</strong> {min_year}<br/>
<strong>Max Year:</strong> {max_year}<br/>
<strong>Source OBIS:</strong> {source_obis}<br/>
<strong>Source GBIF:</strong> {source_gbif}"
)
)
maplibre(
style = maptiler_style("bright", variant = "dark"),
zoom = 3.5,
center = c(-106, 40.1)
) |>
add_vector_source(
id = "er_src",
url = "https://api.marinesensitivity.org/tilejson?table=public.ply_ecoregions_2025"
) |>
add_line_layer(
id = "er_ln",
source = "er_src",
source_layer = "public.ply_ecoregions_2025",
line_color = "gray",
line_opacity = 1,
line_width = 3
) |>
add_fill_layer(
id = "ricei_h3",
source = ply_h3_ricei,
fill_color = "white",
fill_opacity = 0.5,
popup = "popup"
) |>
add_fullscreen_control() |>
add_navigation_control() |>
add_scale_control() |>
add_geocoder_control()
```
## Apply Filters to Occurrence Data
```{r}
#| label: filters to d_spp_er_occ
# apply
d_filt0 <- ply_h3 |>
# unfiltered, regardless of source (obis/gbif) or year (even has min_year)
select(worms_id, h3_cell) |>
st_join(
ply_er |>
select(er_key)
) |>
filter(!is.na(er_key)) |>
st_drop_geometry() |>
group_by(worms_id, er_key) |>
summarise(
n_occ_hex = n(),
.groups = "drop"
) # 75 rows
d_filt1 <- ply_h3 |>
# source_obis
filter(
source_obis == T
) |>
select(worms_id, h3_cell) |>
st_join(
ply_er |>
select(er_key)
) |>
filter(!is.na(er_key)) |>
st_drop_geometry() |>
group_by(worms_id, er_key) |>
summarise(
n_occ_hex = n(),
.groups = "drop"
) # 69 rows
d_filt2 <- ply_h3 |>
# source_obis, has min_year data
filter(
source_obis == T,
!is.na(min_year)
) |>
select(worms_id, h3_cell) |>
st_join(
ply_er |>
select(er_key)
) |>
filter(!is.na(er_key)) |>
st_drop_geometry() |>
group_by(worms_id, er_key) |>
summarise(
n_occ_hex = n(),
.groups = "drop"
) # 62 rows
d_filt3 <- ply_h3 |>
# source_obis, min_year >= 1975 (last 50 years)
filter(
source_obis == T,
!is.na(min_year),
min_year >= 1975
) |>
select(worms_id, h3_cell) |>
st_join(
ply_er |>
select(er_key)
) |>
filter(!is.na(er_key)) |>
st_drop_geometry() |>
group_by(worms_id, er_key) |>
summarise(
n_occ_hex = n(),
.groups = "drop"
) # 55 rows
d_filt4 <- ply_h3 |>
# source_obis, min_year >= 2000 (last 25 years)
filter(
source_obis == T,
!is.na(min_year),
min_year >= 2000
) |>
select(worms_id, h3_cell) |>
st_join(
ply_er |>
select(er_key)
) |>
filter(!is.na(er_key)) |>
st_drop_geometry() |>
group_by(worms_id, er_key) |>
summarise(
n_occ_hex = n(),
.groups = "drop"
) # 45 rows
# sequentially more restrective filters
# filter: 0 > 1 > 2 > 3 > 4
# n_rows: 75 > 69 > 62 > 55 > 45
d_spp_er_occ <- d_spp_er |>
select(-sp_cmn) |>
full_join(
d_filt0 |>
rename(n_occ_hex_f0 = n_occ_hex),
by = c("worms_id", "er_key")
) |>
full_join(
d_filt1 |>
rename(n_occ_hex_f1 = n_occ_hex),
by = c("worms_id", "er_key")
) |>
full_join(
d_filt2 |>
rename(n_occ_hex_f2 = n_occ_hex),
by = c("worms_id", "er_key")
) |>
full_join(
d_filt3 |>
rename(n_occ_hex_f3 = n_occ_hex),
by = c("worms_id", "er_key")
) |>
full_join(
d_filt4 |>
rename(n_occ_hex_f4 = n_occ_hex),
by = c("worms_id", "er_key")
) |>
left_join(
d_spp15 |>
select(sp_cmn, worms_id),
by = "worms_id"
) |>
relocate(sp_cmn, worms_id, er_key) |>
arrange(sp_cmn, er_key)
datatable(
d_spp_er_occ,
caption = "Species counts in Ecoregions for SDM vs occurrenced data"
)
```
## Compare Ecoregions from SDM vs Occurrence Data
```{r}
#| label: d_spp_er_occ_lgl to d_spp_er_best
d_spp_er_occ_lgl <- d_spp_er_occ |>
mutate(
has_sdm = !is.na(n_sdm_cell),
has_occ_f0 = !is.na(n_occ_hex_f0),
has_occ_f1 = !is.na(n_occ_hex_f1),
has_occ_f2 = !is.na(n_occ_hex_f2),
has_occ_f3 = !is.na(n_occ_hex_f3),
has_occ_f4 = !is.na(n_occ_hex_f4)
) |>
select(-starts_with("n_"))
# color code logicals with red (F) and green (T) and replace with icons
d_spp_er_occ_lgl |>
# mutate(
# sp_cmn_id = glue("{sp_cmn} ({worms_id})")
# ) |>
# relocate(sp_cmn_id) |>
# select(-sp_cmn, -worms_id) |>
select(-worms_id) |>
datatable(
caption = "Species counts in Ecoregions from SDM vs Occurrence data as logical values",
) |>
formatStyle(
columns = c(
"has_sdm",
"has_occ_f0",
"has_occ_f1",
"has_occ_f2",
"has_occ_f3",
"has_occ_f4"
),
target = "cell",
backgroundColor = styleEqual(
c(TRUE, FALSE),
c("lightgreen", "lightcoral")
),
color = styleEqual(
c(TRUE, FALSE),
c("darkgreen", "darkred")
)
)
# per sp_cmn and filter, tally # ecoregions in sdm vs occ that: match, miss, extra
d_spp_er_smry <- d_spp_er_occ_lgl |>
pivot_longer(
cols = starts_with("has_occ_"),
names_to = "filter",
names_prefix = "has_occ_",
values_to = "has_occ"
) |>
filter(
has_sdm | has_occ
) |>
mutate(
match_type = case_when(
has_sdm & has_occ ~ "n_er_equal",
has_sdm & !has_occ ~ "n_er_miss",
!has_sdm & has_occ ~ "n_er_add",
TRUE ~ "none"
)
) |>
group_by(sp_cmn, filter, match_type) |>
summarise(
n_ecoregions = n(),
.groups = "drop"
) |>
pivot_wider(
names_from = match_type,
values_from = n_ecoregions,
values_fill = 0
) |>
relocate(n_er_equal, n_er_miss, n_er_add, .after = filter) |>
arrange(sp_cmn, filter)
datatable(
d_spp_er_smry,
caption = "Summary of Ecoregion matches between SDM and Occurrence data per species and filter"
)
# determine which filter maximizes n_er_equal while minimizing n_er_miss and n_er_add
d_spp_er_best <- d_spp_er_smry |>
group_by(sp_cmn) |>
slice_max(
order_by = n_er_equal,
with_ties = T
) |>
slice_min(
order_by = n_er_miss,
with_ties = T
) |>
slice_min(
order_by = n_er_add,
with_ties = T
) |>
ungroup() |>
arrange(sp_cmn)
datatable(
d_spp_er_best,
caption = "Summary of Best Ecoregion matches between SDM and Occurrence data per species"
)
```