Transform downscaled AquaMaps data into DuckDB for efficient biodiversity analysis
Published
2025-05-27 20:28:46
1 Overview
This notebook ingests downscaled AquaMaps species distribution models (0.05° resolution) into a DuckDB database (msens_sdm) for efficient querying and biodiversity metric calculation. The database schema supports multiple SDM sources and includes spatial functions for raster generation and polygon summarization.
Code
# Setup: load packages, source project helpers, and open the DuckDB database.
librarian::shelf(
  DBI, dbplyr, dplyr, duckdb, fs, glue, here, leaflet, mapgl,
  purrr, sf, terra, tibble, tidyr,
  quiet = TRUE)  # TRUE, not T: T is reassignable and unsafe

source(here("libs/db.R"))
source(here("libs/am_functions.R"))
source(here("libs/sdm_functions.R"))

# set up directories: server vs local development data paths
is_server <- Sys.info()[["sysname"]] == "Linux"
dir_data  <- ifelse(
  is_server,
  "/share/data",
  "~/My Drive/projects/msens/data")

# database setup
db_path <- glue("{dir_data}/derived/msens_sdm.duckdb")
con_sdm <- dbConnect(duckdb(), dbdir = db_path, read_only = FALSE)
# dbDisconnect(con_sdm, shutdown = T)
# duckdb_shutdown(duckdb())

# Is the DuckDB `spatial` extension already loaded on this connection?
# Probes a spatial function: if ST_Point() is callable the extension is
# available; any error means it is not.
check_spatial <- function(con) {
  tryCatch({
    res <- dbGetQuery(con, "SELECT ST_Point(0, 0)")
    TRUE
  }, error = function(e) {
    FALSE
  })
}

# Only install/load the spatial extension if not already available
if (!check_spatial(con_sdm)) {
  res <- dbExecute(con_sdm, "INSTALL spatial; LOAD spatial;")
}
2 Database Schema
The database schema supports multiple species per model and includes biodiversity metrics with flexible regional aggregation.
erDiagram
%% https://mermaid.js.org/syntax/entityRelationshipDiagram.html
%% tables
dataset {
str ds_key PK "dataset key"
str name_short "short name: {source_broad} {taxa_groups} {response_type} in {regions}, {year_pub}"
str name_original "original dataset name"
str description
str citation
str source_broad "e.g. NOAA, AquaMaps, Duke, NCCOS"
str source_detail "e.g. NOAA Southeast Fisheries Science Center"
str regions "e.g. Global, Atlantic, Pacific, Gulf of America"
str response_type "occurrence, range, suitability, biomass, density, diversity"
str taxa_groups "fish, invertebrates, marine mammals, cetaceans, sea turtles, seabirds"
int year_pub "year published"
date date_obs_beg "first observation date"
date date_obs_end "last observation date"
date date_env_beg "first environmental data date"
date date_env_end "last environmental data date"
str link_info "primary information URL"
str link_download "download URL"
str link_metadata "metadata URL"
str links_other "other URLs space-separated"
dbl spatial_res_deg "spatial resolution in decimal degrees"
str temporal_res "temporal resolution"
date date_created "date added to database"
}
model {
int mdl_seq PK "sequential model ID"
str ds_key FK "dataset key"
str taxa "taxonomic identifier"
str time_period "time period"
str region "geographic region"
str mdl_type "occurrence, range, suitability, biomass, density, diversity"
str description
date date_created "date added to database"
}
species {
int sp_id PK "species ID"
str ds_key FK "dataset key"
str taxa FK "links to model.taxa"
str sp_key "species key"
int aphia_id "WoRMS Aphia ID"
int gbif_id "GBIF ID"
int itis_id "ITIS ID"
str scientific_name
str common_name
str sp_cat "fish, mollusk, crustacean, etc."
}
cell {
int cell_id PK "cell ID"
dbl lon "longitude"
dbl lat "latitude"
dbl area_km2 "cell area in square kilometers"
geo geom "cell geometry"
}
model_cell {
int cell_id FK "cell ID"
int mdl_seq FK "model sequential ID"
dbl value "model value"
}
metric {
int metric_seq PK "auto-incrementing metric ID"
str metric_type "species_richness, shannon_index, etc."
str description
date date_created "date metric was created"
}
cell_metric {
int cell_id FK "cell ID"
int metric_seq FK "metric ID"
dbl value "calculated metric value"
}
region {
int rgn_seq PK "auto-incrementing region ID"
str tbl_col "{table}.{column} e.g. planning_areas.pa_key"
str value "unique identifier value e.g. CHU"
str col_geom "geometry column name"
date date_created "date added to database"
}
region_metric {
int rgn_seq FK "region ID"
int metric_seq FK "metric ID"
dbl value "calculated metric value"
}
region_cell {
int rgn_seq FK "region ID"
int cell_id FK "cell ID"
int pct_covered "percentage of cell covered by region"
}
planning_areas {
str pa_key PK "planning area key"
str pa_name
geo geom "planning area geometry"
}
%% relationships
dataset ||--o{ model : ds_key
dataset ||--o{ species : ds_key
model ||--|{ species : "ds_key,taxa"
model ||--o{ model_cell : mdl_seq
cell ||--o{ model_cell : cell_id
cell ||--o{ cell_metric : cell_id
cell ||--o{ region_cell : cell_id
metric ||--o{ cell_metric : metric_seq
metric ||--o{ region_metric : metric_seq
region ||--o{ region_metric : rgn_seq
region ||--o{ region_cell : rgn_seq
3 Create Database Tables
Code
# (Re)build the SDM schema: drop any legacy/current tables, then create
# sequences, tables (parents before children), and indexes.

# drop existing tables if needed
if (dbExistsTable(con_sdm, "model_cell") || dbExistsTable(con_sdm, "sdm_values")) {
  tables_to_drop <- c(
    "region_metric", "region_cell", "cell_metric", "env_layers",
    "model_cell", "sdm_values", "species", "model", "dataset",
    "cell", "sdm_cells", "metric", "region", "planning_areas",
    "biodiv_metrics", "planning_area_metrics", "sdm_species",
    "sdm_models", "sdm_sources")
  for (tbl in tables_to_drop) {
    if (dbExistsTable(con_sdm, tbl))
      dbExecute(con_sdm, glue("DROP TABLE {tbl} CASCADE"))
  }
}

# DDL statements in dependency order: sequences precede the tables that use
# them via nextval(); referenced tables precede their referencing tables.
ddl_statements <- c(
  "CREATE TABLE dataset (
     ds_key VARCHAR PRIMARY KEY,
     name_short VARCHAR NOT NULL,
     name_original VARCHAR,
     description TEXT,
     citation TEXT,
     source_broad VARCHAR,
     source_detail VARCHAR,
     regions VARCHAR,
     response_type VARCHAR,
     taxa_groups VARCHAR,
     year_pub INTEGER,
     date_obs_beg DATE,
     date_obs_end DATE,
     date_env_beg DATE,
     date_env_end DATE,
     link_info VARCHAR,
     link_download VARCHAR,
     link_metadata VARCHAR,
     links_other VARCHAR,
     spatial_res_deg DOUBLE,
     temporal_res VARCHAR,
     date_created DATE DEFAULT CURRENT_DATE)",
  "CREATE SEQUENCE model_mdl_seq_seq START 1",
  "CREATE TABLE model (
     mdl_seq INTEGER PRIMARY KEY DEFAULT nextval('model_mdl_seq_seq'),
     ds_key VARCHAR REFERENCES dataset(ds_key),
     taxa VARCHAR,
     time_period VARCHAR,
     region VARCHAR,
     mdl_type VARCHAR,
     description TEXT,
     date_created DATE DEFAULT CURRENT_DATE)",
  "CREATE TABLE species (
     sp_id INTEGER PRIMARY KEY,
     ds_key VARCHAR REFERENCES dataset(ds_key),
     taxa VARCHAR,
     sp_key VARCHAR NOT NULL,
     aphia_id INTEGER,
     gbif_id INTEGER,
     itis_id INTEGER,
     scientific_name VARCHAR,
     common_name VARCHAR,
     sp_cat VARCHAR)",
  "CREATE TABLE cell (
     cell_id INTEGER PRIMARY KEY,
     lon DOUBLE NOT NULL,
     lat DOUBLE NOT NULL,
     area_km2 DOUBLE NOT NULL,
     geom GEOMETRY)",
  "CREATE TABLE model_cell (
     cell_id INTEGER REFERENCES cell(cell_id),
     mdl_seq INTEGER REFERENCES model(mdl_seq),
     value DOUBLE,
     PRIMARY KEY (cell_id, mdl_seq))",
  "CREATE SEQUENCE metric_metric_seq_seq START 1",
  "CREATE TABLE metric (
     metric_seq INTEGER PRIMARY KEY DEFAULT nextval('metric_metric_seq_seq'),
     metric_type VARCHAR NOT NULL,
     description TEXT,
     date_created DATE DEFAULT CURRENT_DATE)",
  "CREATE TABLE cell_metric (
     cell_id INTEGER REFERENCES cell(cell_id),
     metric_seq INTEGER REFERENCES metric(metric_seq),
     value DOUBLE,
     PRIMARY KEY (cell_id, metric_seq))",
  "CREATE SEQUENCE region_rgn_seq_seq START 1",
  "CREATE TABLE region (
     rgn_seq INTEGER PRIMARY KEY DEFAULT nextval('region_rgn_seq_seq'),
     tbl_col VARCHAR NOT NULL,
     value VARCHAR NOT NULL,
     col_geom VARCHAR NOT NULL,
     date_created DATE DEFAULT CURRENT_DATE)",
  "CREATE TABLE region_metric (
     rgn_seq INTEGER REFERENCES region(rgn_seq),
     metric_seq INTEGER REFERENCES metric(metric_seq),
     value DOUBLE,
     PRIMARY KEY (rgn_seq, metric_seq))",
  "CREATE TABLE region_cell (
     rgn_seq INTEGER REFERENCES region(rgn_seq),
     cell_id INTEGER REFERENCES cell(cell_id),
     pct_covered INTEGER,
     PRIMARY KEY (rgn_seq, cell_id))",
  "CREATE TABLE planning_areas (
     pa_key VARCHAR PRIMARY KEY,
     pa_name VARCHAR,
     geom GEOMETRY)")

# indexes to speed cell/model joins, coordinate lookups, and spatial queries
index_statements <- c(
  "CREATE INDEX idx_model_cell_cell_id ON model_cell(cell_id)",
  "CREATE INDEX idx_model_cell_mdl_seq ON model_cell(mdl_seq)",
  "CREATE INDEX idx_cell_coords ON cell(lon, lat)",
  "CREATE INDEX idx_species_taxa ON species(ds_key, taxa)",
  "CREATE INDEX idx_model_taxa ON model(ds_key, taxa)",
  "CREATE INDEX idx_region_cell_rgn ON region_cell(rgn_seq)",
  "CREATE INDEX idx_region_cell_cell ON region_cell(cell_id)",
  "CREATE SPATIAL INDEX idx_cell_geom ON cell(geom)",
  "CREATE SPATIAL INDEX idx_planning_areas_geom ON planning_areas(geom)")

for (sql in c(ddl_statements, index_statements)) {
  dbExecute(con_sdm, sql)
}
4 Ingest AquaMaps Source
Code
# Register AquaMaps as a dataset (one row in `dataset`).
aquamaps_dataset <- tibble(
  ds_key          = "am_0.05",
  name_short      = "AquaMaps fish suitability in Global, 2023",
  name_original   = "AquaMaps: Predicted range maps for aquatic species",
  description     = "AquaMaps species distribution models downscaled to 0.05° using Bio-Oracle v3 environmental layers",
  citation        = "Kaschner et al. (2023) AquaMaps: Predicted range maps for aquatic species. Version 10/2019. www.aquamaps.org",
  source_broad    = "AquaMaps",
  source_detail   = "AquaMaps.org collaborative project",
  regions         = "Global",
  response_type   = "suitability",
  taxa_groups     = "fish",
  year_pub        = 2023,
  date_obs_beg    = as.Date("1990-01-01"),
  date_obs_end    = as.Date("2020-12-31"),
  date_env_beg    = as.Date("2000-01-01"),
  date_env_end    = as.Date("2020-12-31"),
  link_info       = "https://www.aquamaps.org",
  link_download   = "https://www.aquamaps.org/download/main.php",
  link_metadata   = "https://www.aquamaps.org/main/alg_2019.php",
  links_other     = "https://www.fishbase.org https://www.sealifebase.org",
  spatial_res_deg = 0.05,
  temporal_res    = "static")

dbWriteTable(con_sdm, "dataset", aquamaps_dataset, append = TRUE)
5 Load Planning Areas
Code
# Load planning-area polygons from the existing database into DuckDB.
# NOTE(review): `con` is the source connection opened in libs/db.R — confirm.

# read planning areas from existing database
ply_pa <- st_read(con, "ply_planareas_2025")

# simplify and reproject to WGS84 for DuckDB
planning_areas <- ply_pa |>
  select(pa_key = planarea_key, pa_name = planarea_name, geom) |> # TODO: keep original
  st_transform(4326)

# convert geometry to WKT so it can be passed through ST_GeomFromText()
pa_df <- planning_areas |>
  mutate(geom_wkt = st_as_text(geom)) |>
  st_drop_geometry()

dbExecute(con_sdm, "DELETE FROM planning_areas")
for (i in seq_len(nrow(pa_df))) {
  # parameterized insert: safe for names containing quotes (the original
  # glue-built SQL would break or inject on an apostrophe in pa_name)
  dbExecute(
    con_sdm,
    "INSERT INTO planning_areas (pa_key, pa_name, geom)
     VALUES (?, ?, ST_GeomFromText(?))",
    params = list(pa_df$pa_key[i], pa_df$pa_name[i], pa_df$geom_wkt[i]))
}
6 Create 0.05° Grid Cells
Code
# Build the global 0.05° cell grid and bulk-load it into the `cell` table.

# create global 0.05 degree raster template
res <- 0.05
r_global <- terra::rast(
  xmin = -180, xmax = 180,
  ymin = -90, ymax = 90,
  resolution = res,
  crs = "EPSG:4326")

# get cell IDs and coordinates using terra
cell_ids <- terra::cells(r_global)
coords   <- terra::xyFromCell(r_global, cell_ids)

# calculate cell areas in km2
# for 0.05 degree cells, area varies by latitude
cell_areas <- terra::cellSize(r_global, unit = "km")
areas_km2  <- terra::values(cell_areas)[cell_ids]

# create grid cells dataframe
grid_cells <- tibble(
  cell_id  = cell_ids,
  lon      = coords[, 1],
  lat      = coords[, 2],
  area_km2 = areas_km2)

message(glue("Inserting {nrow(grid_cells)} grid cells..."))

# Bulk load: register the data frame as a DuckDB view and build the point
# geometry in SQL with ST_Point(). The original inserted one row per
# dbExecute() call (millions of round-trips); this is a single statement.
duckdb::duckdb_register(con_sdm, "grid_cells_stage", grid_cells)
dbExecute(con_sdm, "
  INSERT INTO cell (cell_id, lon, lat, area_km2, geom)
  SELECT cell_id, lon, lat, area_km2, ST_Point(lon, lat)
  FROM grid_cells_stage")
duckdb::duckdb_unregister(con_sdm, "grid_cells_stage")

# save the global raster template for later use
saveRDS(r_global, glue("{dir_data}/derived/r_global_0.05.rds"))
7 Process AquaMaps downscaled models
Code
# Ingest per-species downscaled rasters: one `model` row, one `species` row,
# and the non-NA cell values into `model_cell` for each GeoTIFF.

# this chunk assumes the iterate_species_in_planareas chunk from
# ingest_aquamaps_res05.qmd has been run and produced species rasters

# get list of species rasters
dir_tifs <- glue("{dir_data}/derived/aquamaps.org/spp_cells0.05_cog")
# FIX: original referenced undefined `species_dir`; also "*.tif$" is not a
# valid regex for list.files() — escape the dot
spp_tifs <- list.files(dir_tifs, pattern = "\\.tif$", full.names = TRUE)

# load global raster template
r_global <- readRDS(glue("{dir_data}/derived/r_global_0.05.rds"))

# process each species
sp_id_counter <- 1
for (sp_tif in head(spp_tifs, 10)) { # process first 10 for testing
  # FIX: original referenced undefined `sp_file`; the loop variable is sp_tif
  sp_key <- basename(sp_tif) |> path_ext_remove()
  message(glue("Processing species: {sp_key}"))

  # read species raster
  r_sp <- rast(sp_tif)

  # get species info from original database
  sp_info <- tbl(con, "spp") |>
    filter(sp_key == !!sp_key) |>
    collect()

  # create model for this species (one model per species for AquaMaps)
  model_info <- tibble(
    ds_key      = "am_0.05",
    taxa        = sp_key, # for AquaMaps, taxa equals sp_key
    time_period = "2019",
    region      = "Global",
    mdl_type    = "suitability",
    description = glue("AquaMaps suitability for {sp_info$genus[1]} {sp_info$species[1]}, interpolated to 0.05° resolution"))
  dbWriteTable(con_sdm, "model", model_info, append = TRUE)

  # get the mdl_seq that was just created
  mdl_seq <- dbGetQuery(con_sdm, glue("
    SELECT mdl_seq FROM model
    WHERE ds_key = '{model_info$ds_key}'
      AND taxa = '{model_info$taxa}'
    ORDER BY mdl_seq DESC
    LIMIT 1"))$mdl_seq

  # determine species category from taxonomy (scalar comparisons)
  sp_cat <- case_when(
    sp_info$class[1]  == "Actinopterygii" ~ "fish",
    sp_info$class[1]  == "Chondrichthyes" ~ "fish",
    sp_info$class[1]  == "Mammalia"       ~ "marine mammal",
    sp_info$class[1]  == "Reptilia"       ~ "sea turtle",
    sp_info$phylum[1] == "Mollusca"       ~ "mollusk",
    sp_info$phylum[1] == "Arthropoda" &
      sp_info$subphylum[1] == "Crustacea" ~ "crustacean",
    TRUE ~ "other")

  # add species to database
  species_info <- tibble(
    sp_id           = sp_id_counter,
    ds_key          = "am_0.05",
    taxa            = sp_key,
    sp_key          = sp_key,
    aphia_id        = sp_info$aphia_id[1],
    gbif_id         = NA_integer_, # TODO: lookup from GBIF
    itis_id         = NA_integer_, # TODO: lookup from ITIS
    scientific_name = glue("{sp_info$genus[1]} {sp_info$species[1]}"),
    common_name     = sp_info$common_name[1],
    sp_cat          = sp_cat)
  dbWriteTable(con_sdm, "species", species_info, append = TRUE)

  # Extract non-NA cell values. FIX: the original subset
  # terra::cells(r_sp) (length = #non-NA cells) by a logical mask of
  # length ncell(r_sp), silently recycling the mask. Read values once and
  # derive both the mask and the cell numbers from it.
  v    <- terra::values(r_sp, mat = FALSE)
  keep <- !is.na(v)

  # create model_cell records
  model_cell_df <- tibble(
    cell_id = which(keep),
    mdl_seq = mdl_seq,
    value   = v[keep] / 100) # convert from percentage to probability

  # batch insert values
  batch_insert_values(con_sdm, model_cell_df, "model_cell")

  sp_id_counter <- sp_id_counter + 1
}
8 Calculate Biodiversity Metrics
Code
# Define biodiversity metrics and compute them per cell inside DuckDB.
# NOTE(review): simpson_index is registered below but never computed in
# this notebook — confirm whether that is intentional.

# first create metric definitions
metrics_to_create <- tibble::tribble(
  ~metric_type,       ~description,
  "species_richness", "Number of species with suitability > 0.5",
  "shannon_index",    "Shannon diversity index based on suitability values",
  "simpson_index",    "Simpson diversity index based on suitability values")

for (i in seq_len(nrow(metrics_to_create))) {
  # parameterized insert: descriptions may contain quote characters
  dbExecute(
    con_sdm,
    "INSERT INTO metric (metric_type, description) VALUES (?, ?)",
    params = list(
      metrics_to_create$metric_type[i],
      metrics_to_create$description[i]))
}

# get metric IDs
metric_ids <- dbGetQuery(con_sdm, "SELECT metric_seq, metric_type FROM metric")

# calculate species richness per cell
message("Calculating species richness...")
richness_metric_seq <-
  metric_ids$metric_seq[metric_ids$metric_type == "species_richness"]
dbExecute(con_sdm, glue("
  INSERT INTO cell_metric (cell_id, metric_seq, value)
  SELECT mc.cell_id,
         {richness_metric_seq} as metric_seq,
         COUNT(DISTINCT m.taxa) as value
  FROM model_cell mc
  JOIN model m ON mc.mdl_seq = m.mdl_seq
  WHERE mc.value > 0.5 -- presence threshold
  GROUP BY mc.cell_id"))

# calculate Shannon diversity index: H = -sum(p * ln(p)) where p is each
# model's share of the cell's total suitability
message("Calculating Shannon diversity...")
shannon_metric_seq <-
  metric_ids$metric_seq[metric_ids$metric_type == "shannon_index"]
dbExecute(con_sdm, glue("
  INSERT INTO cell_metric (cell_id, metric_seq, value)
  SELECT cell_id,
         {shannon_metric_seq} as metric_seq,
         -SUM(p * LN(p)) as value
  FROM (
    SELECT mc.cell_id,
           mc.value / SUM(mc.value) OVER (PARTITION BY mc.cell_id) as p
    FROM model_cell mc
    WHERE mc.value > 0) t
  WHERE p > 0
  GROUP BY cell_id"))
9 Generate Biodiversity Raster
Code
# Query a regional species-richness slice, rasterize it, and map it.

# load global raster template
r_global <- readRDS(glue("{dir_data}/derived/r_global_0.05.rds"))

# get species richness metric ID
richness_metric_seq <- dbGetQuery(con_sdm, "
  SELECT metric_seq FROM metric
  WHERE metric_type = 'species_richness'")$metric_seq

# query species richness for a region
richness_df <- dbGetQuery(con_sdm, glue("
  SELECT cm.cell_id, cm.value as species_richness
  FROM cell_metric cm
  WHERE cm.metric_seq = {richness_metric_seq}
    AND cm.cell_id IN (
      SELECT cell_id FROM cell
      WHERE lon BETWEEN -180 AND -60 -- limit to a region for testing
        AND lat BETWEEN 20 AND 50)"))

# convert to raster using the new function
r_richness <- create_raster_from_df(
  richness_df,
  r_template = r_global,
  value_cols = "species_richness")

# Render a raster with mapgl.
# FIX: removed an unused data-frame conversion of `r` (dead code in the
# original). NOTE(review): confirm add_raster_tile_layer() accepts the
# raster object directly via `source =` — verify against the mapgl API.
mapgl_rast <- function(r, palette = "viridis", layer_name = "SDM") {
  mapgl() |>
    add_raster_tile_layer(
      id            = "sdm_raster",
      source        = r,
      source_layer  = layer_name,
      color_palette = palette)
}

# visualize
mapgl_rast(r_richness, palette = "YlOrRd", layer_name = "Species Richness")
10 Summarize by Planning Area
Code
# Register planning areas as regions and summarize species richness per area.

# first, register planning areas as regions
planning_area_regions <- dbGetQuery(con_sdm, "
  SELECT pa_key, pa_name FROM planning_areas")

for (i in seq_len(nrow(planning_area_regions))) {
  # parameterized insert: safe if a key ever contains a quote
  dbExecute(
    con_sdm,
    "INSERT INTO region (tbl_col, value, col_geom)
     VALUES ('planning_areas.pa_key', ?, 'geom')",
    params = list(planning_area_regions$pa_key[i]))
}

# get region and metric IDs
region_ids <- dbGetQuery(con_sdm, "
  SELECT rgn_seq, value FROM region
  WHERE tbl_col = 'planning_areas.pa_key'")
richness_metric_seq <- dbGetQuery(con_sdm, "
  SELECT metric_seq FROM metric
  WHERE metric_type = 'species_richness'")$metric_seq

# calculate average species richness per planning area
message("Calculating planning area metrics...")
for (i in seq_len(nrow(region_ids))) {
  pa_key  <- region_ids$value[i]
  rgn_seq <- region_ids$rgn_seq[i]

  # average richness over cells whose (point) geometry falls in the polygon
  result <- dbGetQuery(con_sdm, glue("
    SELECT AVG(cm.value) as avg_value
    FROM planning_areas p
    JOIN cell c ON ST_Within(c.geom, p.geom)
    JOIN cell_metric cm ON c.cell_id = cm.cell_id
    WHERE p.pa_key = '{pa_key}'
      AND cm.metric_seq = {richness_metric_seq}"))

  # AVG over zero rows yields NA; only store real summaries
  if (!is.na(result$avg_value[1])) {
    dbExecute(
      con_sdm,
      "INSERT INTO region_metric (rgn_seq, metric_seq, value)
       VALUES (?, ?, ?)",
      params = list(rgn_seq, richness_metric_seq, result$avg_value[1]))
  }
}

# query results
pa_metrics <- dbGetQuery(con_sdm, "
  SELECT p.pa_key, p.pa_name, rm.value as avg_richness
  FROM region r
  JOIN region_metric rm ON r.rgn_seq = rm.rgn_seq
  JOIN planning_areas p ON r.value = p.pa_key
  WHERE r.tbl_col = 'planning_areas.pa_key'
    AND rm.metric_seq = (
      SELECT metric_seq FROM metric
      WHERE metric_type = 'species_richness')
  ORDER BY rm.value DESC")
print(pa_metrics)
11 Add Environmental Layers
Code
# Sketch: resample an environmental layer (net primary productivity) onto
# the 0.05° grid. NOTE(review): this chunk is incomplete — the final insert
# into env_layers is only a comment, and `env_layers` is not created by the
# schema chunk; confirm intent before enabling.

# example: add primary productivity from ingest_productivity.qmd
npp_tif <- here("data/derived/vgpm-viirs_2023-daily-avg.tif")

if (file.exists(npp_tif)) {
  r_npp <- rast(npp_tif)

  # resample to 0.05 degree resolution on the same global template
  r_template <- rast(
    extent = ext(-180, 180, -90, 90),
    resolution = 0.05,
    crs = "EPSG:4326")
  r_npp_res <- resample(r_npp, r_template, method = "bilinear")

  # extract values; `npp_daily_avg` is assumed to be the layer name — TODO confirm
  npp_df <- as.data.frame(r_npp_res, xy = TRUE) |>
    filter(!is.na(npp_daily_avg)) |>
    mutate(
      # NOTE(review): rounding to multiples of 0.05 snaps to cell EDGES;
      # grid-cell centers lie at 0.025 offsets — verify before joining to `cell`
      cell_lon = round(x / 0.05) * 0.05,
      cell_lat = round(y / 0.05) * 0.05)

  # insert into env_layers
  # (similar process as species values)
}
12 Export to Parquet
Code
# Export all tables to Parquet using DuckDB's native COPY. This streams
# straight from the database to disk and correctly serializes GEOMETRY
# columns, avoiding the original round-trip through an R data frame
# (dbGetQuery on GEOMETRY can mangle or drop the column).
output_dir <- glue("{dir_data}/derived/sdm_parquet")
dir_create(output_dir)

# export each table
tables_to_export <- c(
  "dataset", "model", "species", "cell", "model_cell", "metric",
  "cell_metric", "region", "region_metric", "region_cell",
  "planning_areas")

for (tbl in tables_to_export) {
  message(glue("Exporting {tbl} to parquet..."))
  pq_path <- file.path(output_dir, glue("{tbl}.parquet"))
  dbExecute(con_sdm, glue(
    "COPY (SELECT * FROM {tbl}) TO '{pq_path}'
     (FORMAT PARQUET, COMPRESSION SNAPPY)"))
}
13 Close Connection
Code
# Close the connection AND shut down the embedded DuckDB instance so the
# database file is released (plain dbDisconnect leaves the driver running).
dbDisconnect(con_sdm, shutdown = TRUE)
Source Code
---title: "Ingest AquaMaps to SDM DuckDB"subtitle: "Transform downscaled AquaMaps data into DuckDB for efficient biodiversity analysis"format: html: code-fold: true code-tools: trueeditor_options: chunk_output_type: console---## OverviewThis notebook ingests downscaled AquaMaps species distribution models (0.05° resolution) into a DuckDB database (`msens_sdm`) for efficient querying and biodiversity metric calculation. The database schema supports multiple SDM sources and includes spatial functions for raster generation and polygon summarization.```{r setup}#| message: false#| warning: falselibrarian::shelf( DBI, dbplyr, dplyr, duckdb, fs, glue, here, leaflet, mapgl, purrr, sf, terra, tibble, tidyr, quiet = T )source(here("libs/db.R"))source(here("libs/am_functions.R"))source(here("libs/sdm_functions.R"))# set up directoriesis_server <- Sys.info()[["sysname"]] == "Linux"dir_data <- ifelse( is_server, "/share/data", "~/My Drive/projects/msens/data")# database setupdb_path <- glue("{dir_data}/derived/msens_sdm.duckdb")con_sdm <- dbConnect(duckdb(), dbdir = db_path, read_only = FALSE)# dbDisconnect(con_sdm, shutdown = T)# duckdb_shutdown(duckdb())# Check if spatial extension is already loadedcheck_spatial <- function(con) { tryCatch({ # Try to use a spatial function - if it works, extension is loaded res <- dbGetQuery(con, "SELECT ST_Point(0, 0)") return(TRUE) }, error = function(e) { return(FALSE) })}# Only install/load if not already availableif (!check_spatial(con_sdm)) { res <- dbExecute(con_sdm, "INSTALL spatial; LOAD spatial;")}```## Database SchemaThe database schema supports multiple species per model and includes biodiversity metrics with flexible regional aggregation.```{mermaid}erDiagram%% https://mermaid.js.org/syntax/entityRelationshipDiagram.html%% tablesdataset { str ds_key PK "dataset key" str name_short "short name: {source_broad} {taxa_groups} {response_type} in {regions}, {year_pub}" str name_original "original dataset name" str description str 
citation str source_broad "e.g. NOAA, AquaMaps, Duke, NCCOS" str source_detail "e.g. NOAA Southeast Fisheries Science Center" str regions "e.g. Global, Atlantic, Pacific, Gulf of America" str response_type "occurrence, range, suitability, biomass, density, diversity" str taxa_groups "fish, invertebrates, marine mammals, cetaceans, sea turtles, seabirds" int year_pub "year published" date date_obs_beg "first observation date" date date_obs_end "last observation date" date date_env_beg "first environmental data date" date date_env_end "last environmental data date" str link_info "primary information URL" str link_download "download URL" str link_metadata "metadata URL" str links_other "other URLs space-separated" dbl spatial_res_deg "spatial resolution in decimal degrees" str temporal_res "temporal resolution" date date_created "date added to database"}model { int mdl_seq PK "sequential model ID" str ds_key FK "dataset key" str taxa "taxonomic identifier" str time_period "time period" str region "geographic region" str mdl_type "occurrence, range, suitability, biomass, density, diversity" str description date date_created "date added to database"}species { int sp_id PK "species ID" str ds_key FK "dataset key" str taxa FK "links to model.taxa" str sp_key "species key" int aphia_id "WoRMS Aphia ID" int gbif_id "GBIF ID" int itis_id "ITIS ID" str scientific_name str common_name str sp_cat "fish, mollusk, crustacean, etc."}cell { int cell_id PK "cell ID" dbl lon "longitude" dbl lat "latitude" dbl area_km2 "cell area in square kilometers" geo geom "cell geometry"}model_cell { int cell_id FK "cell ID" int mdl_seq FK "model sequential ID" dbl value "model value"}metric { int metric_seq PK "auto-incrementing metric ID" str metric_type "species_richness, shannon_index, etc." 
str description date date_created "date metric was created"}cell_metric { int cell_id FK "cell ID" int metric_seq FK "metric ID" dbl value "calculated metric value"}region { int rgn_seq PK "auto-incrementing region ID" str tbl_col "{table}.{column} e.g. planning_areas.pa_key" str value "unique identifier value e.g. CHU" str col_geom "geometry column name" date date_created "date added to database"}region_metric { int rgn_seq FK "region ID" int metric_seq FK "metric ID" dbl value "calculated metric value"}region_cell { int rgn_seq FK "region ID" int cell_id FK "cell ID" int pct_covered "percentage of cell covered by region"}planning_areas { str pa_key PK "planning area key" str pa_name geo geom "planning area geometry"}%% relationshipsdataset ||--o{ model : ds_keydataset ||--o{ species : ds_keymodel ||--|{ species : "ds_key,taxa"model ||--o{ model_cell : mdl_seqcell ||--o{ model_cell : cell_idcell ||--o{ cell_metric : cell_idcell ||--o{ region_cell : cell_idmetric ||--o{ cell_metric : metric_seqmetric ||--o{ region_metric : metric_seqregion ||--o{ region_metric : rgn_seqregion ||--o{ region_cell : rgn_seq``````{r tmp_not_evel, include=FALSE}knitr::opts_chunk$set(eval = F)```## Create Database Tables```{r create_tables}#| message: false# drop existing tables if neededif (dbExistsTable(con_sdm, "model_cell") || dbExistsTable(con_sdm, "sdm_values")) { tables_to_drop <- c( "region_metric", "region_cell", "cell_metric", "env_layers", "model_cell", "sdm_values", "species", "model", "dataset", "cell", "sdm_cells", "metric", "region", "planning_areas", "biodiv_metrics", "planning_area_metrics", "sdm_species", "sdm_models", "sdm_sources" ) for (tbl in tables_to_drop) { if (dbExistsTable(con_sdm, tbl)) dbExecute(con_sdm, glue("DROP TABLE {tbl} CASCADE")) }}# create tablesdbExecute(con_sdm, "CREATE TABLE dataset ( ds_key VARCHAR PRIMARY KEY, name_short VARCHAR NOT NULL, name_original VARCHAR, description TEXT, citation TEXT, source_broad VARCHAR, source_detail VARCHAR, regions 
VARCHAR, response_type VARCHAR, taxa_groups VARCHAR, year_pub INTEGER, date_obs_beg DATE, date_obs_end DATE, date_env_beg DATE, date_env_end DATE, link_info VARCHAR, link_download VARCHAR, link_metadata VARCHAR, links_other VARCHAR, spatial_res_deg DOUBLE, temporal_res VARCHAR, date_created DATE DEFAULT CURRENT_DATE)")dbExecute(con_sdm, "CREATE SEQUENCE model_mdl_seq_seq START 1")dbExecute(con_sdm, "CREATE TABLE model ( mdl_seq INTEGER PRIMARY KEY DEFAULT nextval('model_mdl_seq_seq'), ds_key VARCHAR REFERENCES dataset(ds_key), taxa VARCHAR, time_period VARCHAR, region VARCHAR, mdl_type VARCHAR, description TEXT, date_created DATE DEFAULT CURRENT_DATE)")dbExecute(con_sdm, "CREATE TABLE species ( sp_id INTEGER PRIMARY KEY, ds_key VARCHAR REFERENCES dataset(ds_key), taxa VARCHAR, sp_key VARCHAR NOT NULL, aphia_id INTEGER, gbif_id INTEGER, itis_id INTEGER, scientific_name VARCHAR, common_name VARCHAR, sp_cat VARCHAR)")dbExecute(con_sdm, "CREATE TABLE cell ( cell_id INTEGER PRIMARY KEY, lon DOUBLE NOT NULL, lat DOUBLE NOT NULL, area_km2 DOUBLE NOT NULL, geom GEOMETRY)")dbExecute(con_sdm, "CREATE TABLE model_cell ( cell_id INTEGER REFERENCES cell(cell_id), mdl_seq INTEGER REFERENCES model(mdl_seq), value DOUBLE, PRIMARY KEY (cell_id, mdl_seq))")dbExecute(con_sdm, "CREATE SEQUENCE metric_metric_seq_seq START 1")dbExecute(con_sdm, "CREATE TABLE metric ( metric_seq INTEGER PRIMARY KEY DEFAULT nextval('metric_metric_seq_seq'), metric_type VARCHAR NOT NULL, description TEXT, date_created DATE DEFAULT CURRENT_DATE)")dbExecute(con_sdm, "CREATE TABLE cell_metric ( cell_id INTEGER REFERENCES cell(cell_id), metric_seq INTEGER REFERENCES metric(metric_seq), value DOUBLE, PRIMARY KEY (cell_id, metric_seq))")dbExecute(con_sdm, "CREATE SEQUENCE region_rgn_seq_seq START 1")dbExecute(con_sdm, "CREATE TABLE region ( rgn_seq INTEGER PRIMARY KEY DEFAULT nextval('region_rgn_seq_seq'), tbl_col VARCHAR NOT NULL, value VARCHAR NOT NULL, col_geom VARCHAR NOT NULL, date_created DATE DEFAULT 
CURRENT_DATE)")dbExecute(con_sdm, "CREATE TABLE region_metric ( rgn_seq INTEGER REFERENCES region(rgn_seq), metric_seq INTEGER REFERENCES metric(metric_seq), value DOUBLE, PRIMARY KEY (rgn_seq, metric_seq))")dbExecute(con_sdm, "CREATE TABLE region_cell ( rgn_seq INTEGER REFERENCES region(rgn_seq), cell_id INTEGER REFERENCES cell(cell_id), pct_covered INTEGER, PRIMARY KEY (rgn_seq, cell_id))")dbExecute(con_sdm, "CREATE TABLE planning_areas ( pa_key VARCHAR PRIMARY KEY, pa_name VARCHAR, geom GEOMETRY)")# create indexesdbExecute(con_sdm, "CREATE INDEX idx_model_cell_cell_id ON model_cell(cell_id)")dbExecute(con_sdm, "CREATE INDEX idx_model_cell_mdl_seq ON model_cell(mdl_seq)")dbExecute(con_sdm, "CREATE INDEX idx_cell_coords ON cell(lon, lat)")dbExecute(con_sdm, "CREATE INDEX idx_species_taxa ON species(ds_key, taxa)")dbExecute(con_sdm, "CREATE INDEX idx_model_taxa ON model(ds_key, taxa)")dbExecute(con_sdm, "CREATE INDEX idx_region_cell_rgn ON region_cell(rgn_seq)")dbExecute(con_sdm, "CREATE INDEX idx_region_cell_cell ON region_cell(cell_id)")dbExecute(con_sdm, "CREATE SPATIAL INDEX idx_cell_geom ON cell(geom)")dbExecute(con_sdm, "CREATE SPATIAL INDEX idx_planning_areas_geom ON planning_areas(geom)")```## Ingest AquaMaps Source```{r ingest_source}# add AquaMaps as a sourceaquamaps_dataset <- tibble( ds_key = "am_0.05", name_short = "AquaMaps fish suitability in Global, 2023", name_original = "AquaMaps: Predicted range maps for aquatic species", description = "AquaMaps species distribution models downscaled to 0.05° using Bio-Oracle v3 environmental layers", citation = "Kaschner et al. (2023) AquaMaps: Predicted range maps for aquatic species. Version 10/2019. 
www.aquamaps.org",
  source_broad    = "AquaMaps",
  source_detail   = "AquaMaps.org collaborative project",
  regions         = "Global",
  response_type   = "suitability",
  taxa_groups     = "fish",
  year_pub        = 2023,
  date_obs_beg    = as.Date("1990-01-01"),
  date_obs_end    = as.Date("2020-12-31"),
  date_env_beg    = as.Date("2000-01-01"),
  date_env_end    = as.Date("2020-12-31"),
  link_info       = "https://www.aquamaps.org",
  link_download   = "https://www.aquamaps.org/download/main.php",
  link_metadata   = "https://www.aquamaps.org/main/alg_2019.php",
  links_other     = "https://www.fishbase.org https://www.sealifebase.org",
  spatial_res_deg = 0.05,
  temporal_res    = "static")
dbWriteTable(con_sdm, "dataset", aquamaps_dataset, append = TRUE)
```

## Load Planning Areas

```{r load_planning_areas}
# read planning areas from the existing source database
# NOTE(review): `con` is assumed to come from libs/db.R — confirm it is open here
ply_pa <- st_read(con, "ply_planareas_2025")

# simplify columns and project to WGS84 for storage in DuckDB
planning_areas <- ply_pa |>
  select(pa_key = planarea_key, pa_name = planarea_name, geom) |>  # TODO: keep original
  st_transform(4326)

# write to DuckDB (convert geometry to WKT first)
pa_df <- planning_areas |>
  mutate(geom_wkt = st_as_text(geom)) |>
  st_drop_geometry()

dbExecute(con_sdm, "DELETE FROM planning_areas")
for (i in seq_len(nrow(pa_df))) {
  # parameterized insert: safe for names/WKT containing single quotes,
  # unlike the previous glue()-built SQL string
  dbExecute(
    con_sdm,
    "INSERT INTO planning_areas (pa_key, pa_name, geom)
     VALUES (?, ?, ST_GeomFromText(?))",
    params = list(pa_df$pa_key[i], pa_df$pa_name[i], pa_df$geom_wkt[i]))
}
```

## Create 0.05° Grid Cells

```{r create_grid}
#| message: false
# create global 0.05 degree raster template
res <- 0.05
r_global <- terra::rast(
  xmin = -180, xmax = 180, ymin = -90, ymax = 90,
  resolution = res, crs = "EPSG:4326")

# get cell IDs and coordinates using terra
cell_ids <- terra::cells(r_global)
coords   <- terra::xyFromCell(r_global, cell_ids)

# calculate cell areas in km2; for 0.05 degree geographic cells,
# area varies by latitude
cell_areas <- terra::cellSize(r_global, unit = "km")
areas_km2  <- terra::values(cell_areas)[cell_ids]

# create grid cells dataframe
grid_cells <- tibble(
  cell_id  = cell_ids,
  lon      = coords[, 1],
  lat      = coords[, 2],
  area_km2 = areas_km2)

# batch insert cells
# NOTE(review): this is one INSERT statement per cell (~26M round trips at
# 0.05°); consider dbWriteTable() to a staging table + a single
# INSERT ... SELECT with ST_Point() if this proves too slow
batch_size <- 10000
n_batches  <- ceiling(nrow(grid_cells) / batch_size)
message(glue("Inserting {nrow(grid_cells)} grid cells in {n_batches} batches..."))

for (i in seq_len(n_batches)) {
  start_idx <- (i - 1) * batch_size + 1
  end_idx   <- min(i * batch_size, nrow(grid_cells))

  batch <- grid_cells[start_idx:end_idx, ] |>
    mutate(geom_wkt = glue("POINT({lon} {lat})"))  # WKT point per cell center

  for (j in seq_len(nrow(batch))) {
    dbExecute(con_sdm, glue("
      INSERT INTO cell (cell_id, lon, lat, area_km2, geom)
      VALUES ({batch$cell_id[j]}, {batch$lon[j]}, {batch$lat[j]},
              {batch$area_km2[j]}, ST_GeomFromText('{batch$geom_wkt[j]}'))"))
  }

  if (i %% 10 == 0) {
    message(glue("  Completed batch {i}/{n_batches}"))
  }
}

# save the global raster template for later use
# NOTE(review): saveRDS() on a SpatRaster relies on terra's saveRDS method
# (newer terra versions); otherwise wrap with terra::wrap() first — confirm
saveRDS(r_global, glue("{dir_data}/derived/r_global_0.05.rds"))
```

## Process Aquamaps downsized models

```{r process_species}
#| eval: false
# this chunk assumes the iterate_species_in_planareas chunk from
# ingest_aquamaps_res05.qmd has been run and produced species rasters

# get list of species rasters
dir_tifs <- glue("{dir_data}/derived/aquamaps.org/spp_cells0.05_cog")
# FIX: was list.files(species_dir, ...) — `species_dir` is undefined;
# also "*.tif$" is a glob, not a regex
spp_tifs <- list.files(dir_tifs, pattern = "\\.tif$", full.names = TRUE)

# load global raster template
r_global <- readRDS(glue("{dir_data}/derived/r_global_0.05.rds"))

# process each species
sp_id_counter <- 1
for (sp_tif in spp_tifs[1:10]) {  # process first 10 for testing

  # FIX: was basename(sp_file) — `sp_file` is undefined; loop variable is sp_tif
  sp_key <- basename(sp_tif) |> path_ext_remove()
  message(glue("Processing species: {sp_key}"))

  # read species raster
  r_sp <- rast(sp_tif)

  # get species info from original database
  sp_info <- tbl(con, "spp") |>
    filter(sp_key == !!sp_key) |>
    collect()

  # create model for this species (one model per species for AquaMaps)
  model_info <- tibble(
    ds_key      = "am_0.05",
    taxa        = sp_key,  # for AquaMaps, taxa equals sp_key
    time_period = "2019",
    region      = "Global",
    mdl_type    = "suitability",
    description = glue(
      "AquaMaps suitability for {sp_info$genus[1]} {sp_info$species[1]}, interpolated to 0.05° resolution"))
  dbWriteTable(con_sdm, "model", model_info, append = TRUE)

  # get the mdl_seq that was just created
  mdl_seq <- dbGetQuery(con_sdm, glue("
    SELECT mdl_seq FROM model
    WHERE ds_key = '{model_info$ds_key}'
      AND taxa   = '{model_info$taxa}'
    ORDER BY mdl_seq DESC
    LIMIT 1"))$mdl_seq

  # determine species category from taxonomy
  sp_cat <- case_when(
    sp_info$class[1] == "Actinopterygii" ~ "fish",
    sp_info$class[1] == "Chondrichthyes" ~ "fish",
    sp_info$class[1] == "Mammalia"       ~ "marine mammal",
    sp_info$class[1] == "Reptilia"       ~ "sea turtle",
    sp_info$phylum[1] == "Mollusca"      ~ "mollusk",
    sp_info$phylum[1] == "Arthropoda" &&
      sp_info$subphylum[1] == "Crustacea" ~ "crustacean",
    TRUE ~ "other")

  # add species to database
  species_info <- tibble(
    sp_id           = sp_id_counter,
    ds_key          = "am_0.05",
    taxa            = sp_key,
    sp_key          = sp_key,
    aphia_id        = sp_info$aphia_id[1],
    gbif_id         = NA_integer_,  # TODO: lookup from GBIF
    itis_id         = NA_integer_,  # TODO: lookup from ITIS
    scientific_name = glue("{sp_info$genus[1]} {sp_info$species[1]}"),
    common_name     = sp_info$common_name[1],
    sp_cat          = sp_cat)
  dbWriteTable(con_sdm, "species", species_info, append = TRUE)

  # extract non-NA cell values
  # FIX: previous code read terra::values() twice with inconsistent NA
  # handling, risking misaligned cell_id/value vectors; read once and subset
  # (cell numbers are 1..ncell in row-major order, matching the values vector)
  v    <- terra::values(r_sp, mat = FALSE)
  keep <- !is.na(v)
  model_cell_df <- tibble(
    cell_id = which(keep),
    mdl_seq = mdl_seq,
    value   = v[keep] / 100)  # convert from percentage to probability

  # batch insert values
  batch_insert_values(con_sdm, model_cell_df, "model_cell")

  sp_id_counter <- sp_id_counter + 1
}
```

## Calculate Biodiversity Metrics

```{r calc_biodiv_metrics}
#| eval: false
# first create metric definitions
metrics_to_create <- tibble::tribble(
  ~metric_type,       ~description,
  "species_richness", "Number of species with suitability > 0.5",
  "shannon_index",    "Shannon diversity index based on suitability values",
  "simpson_index",    "Simpson diversity index based on suitability values")

for (i in seq_len(nrow(metrics_to_create))) {
  dbExecute(con_sdm, glue("
    INSERT INTO metric (metric_type, description)
    VALUES ('{metrics_to_create$metric_type[i]}',
            '{metrics_to_create$description[i]}')"))
}

# get metric IDs
metric_ids <- dbGetQuery(con_sdm, "SELECT metric_seq, metric_type FROM metric")

# calculate species richness per cell
message("Calculating species richness...")
richness_metric_seq <- metric_ids$metric_seq[
  metric_ids$metric_type == "species_richness"]

dbExecute(con_sdm, glue("
  INSERT INTO cell_metric (cell_id, metric_seq, value)
  SELECT
    mc.cell_id,
    {richness_metric_seq} as metric_seq,
    COUNT(DISTINCT m.taxa) as value
  FROM model_cell mc
  JOIN model m ON mc.mdl_seq = m.mdl_seq
  WHERE mc.value > 0.5 -- presence threshold
  GROUP BY mc.cell_id"))

# calculate Shannon diversity index
message("Calculating Shannon diversity...")
shannon_metric_seq <- metric_ids$metric_seq[
  metric_ids$metric_type == "shannon_index"]

dbExecute(con_sdm, glue("
  INSERT INTO cell_metric (cell_id, metric_seq, value)
  SELECT
    cell_id,
    {shannon_metric_seq} as metric_seq,
    -SUM(p * LN(p)) as value
  FROM (
    SELECT
      mc.cell_id,
      mc.value / SUM(mc.value) OVER (PARTITION BY mc.cell_id) as p
    FROM model_cell mc
    WHERE mc.value > 0) t
  WHERE p > 0
  GROUP BY cell_id"))
```

## Generate Biodiversity Raster

```{r generate_raster}
#| eval: false
# load global raster template
r_global <- readRDS(glue("{dir_data}/derived/r_global_0.05.rds"))

# get species richness metric ID
richness_metric_seq <- dbGetQuery(con_sdm, "
  SELECT metric_seq FROM metric
  WHERE metric_type = 'species_richness'")$metric_seq

# query species richness for a region
richness_df <- dbGetQuery(con_sdm, glue("
  SELECT cm.cell_id, cm.value as species_richness
  FROM cell_metric cm
  WHERE cm.metric_seq = {richness_metric_seq}
    AND cm.cell_id IN (
      SELECT cell_id FROM cell
      WHERE lon BETWEEN -180 AND -60 -- limit to a region for testing
        AND lat BETWEEN 20 AND 50)"))

# convert to raster using the new function
r_richness <- create_raster_from_df(
  richness_df,
  r_template = r_global,
  value_cols = "species_richness")

# display with mapgl
# NOTE(review): this helper looks unfinished — `r_df` is computed but never
# used, and add_raster_tile_layer() normally expects a tile source rather
# than a SpatRaster; confirm against the mapgl package documentation
mapgl_rast <- function(r, palette = "viridis", layer_name = "SDM") {
  # convert raster to data frame (currently unused; kept for future styling)
  r_df <- as.data.frame(r, xy = TRUE) |>
    rename(value = 3)

  # create mapgl visualization
  mapgl() |>
    add_raster_tile_layer(
      id            = "sdm_raster",
      source        = r,
      source_layer  = layer_name,
      color_palette = palette)
}

# visualize
mapgl_rast(r_richness, palette = "YlOrRd", layer_name = "Species Richness")
```

## Summarize by Planning Area

```{r summarize_planning_areas}
#| eval: false
# first, register planning areas as regions
planning_area_regions <- dbGetQuery(con_sdm, "
  SELECT pa_key, pa_name FROM planning_areas")

for (i in seq_len(nrow(planning_area_regions))) {
  dbExecute(con_sdm, glue("
    INSERT INTO region (tbl_col, value, col_geom)
    VALUES ('planning_areas.pa_key',
            '{planning_area_regions$pa_key[i]}', 'geom')"))
}

# get region and metric IDs
region_ids <- dbGetQuery(con_sdm, "
  SELECT rgn_seq, value FROM region
  WHERE tbl_col = 'planning_areas.pa_key'")
richness_metric_seq <- dbGetQuery(con_sdm, "
  SELECT metric_seq FROM metric
  WHERE metric_type = 'species_richness'")$metric_seq

# calculate average species richness per planning area
message("Calculating planning area metrics...")
for (i in seq_len(nrow(region_ids))) {
  pa_key  <- region_ids$value[i]
  rgn_seq <- region_ids$rgn_seq[i]

  # calculate metric for this region (cells spatially within the polygon)
  result <- dbGetQuery(con_sdm, glue("
    SELECT AVG(cm.value) as avg_value
    FROM planning_areas p
    JOIN cell c        ON ST_Within(c.geom, p.geom)
    JOIN cell_metric cm ON c.cell_id = cm.cell_id
    WHERE p.pa_key = '{pa_key}'
      AND cm.metric_seq = {richness_metric_seq}"))

  # AVG() over zero rows returns NA — skip regions with no overlapping cells
  if (!is.na(result$avg_value[1])) {
    dbExecute(con_sdm, glue("
      INSERT INTO region_metric (rgn_seq, metric_seq, value)
      VALUES ({rgn_seq}, {richness_metric_seq}, {result$avg_value[1]})"))
  }
}

# query results
pa_metrics <- dbGetQuery(con_sdm, "
  SELECT p.pa_key, p.pa_name, rm.value as avg_richness
  FROM region r
  JOIN region_metric rm ON r.rgn_seq = rm.rgn_seq
  JOIN planning_areas p ON r.value = p.pa_key
  WHERE r.tbl_col = 'planning_areas.pa_key'
    AND rm.metric_seq = (SELECT metric_seq FROM metric
                         WHERE metric_type = 'species_richness')
  ORDER BY rm.value DESC")
print(pa_metrics)
```

## Add Environmental Layers

```{r add_env_layers}
#| eval: false
# example: add primary productivity from ingest_productivity.qmd
npp_tif <- here("data/derived/vgpm-viirs_2023-daily-avg.tif")

if (file.exists(npp_tif)) {
  r_npp <- rast(npp_tif)

  # resample to 0.05 degree resolution
  r_template <- rast(
    extent     = ext(-180, 180, -90, 90),
    resolution = 0.05,
    crs        = "EPSG:4326")
  r_npp_res <- resample(r_npp, r_template, method = "bilinear")

  # extract values, snapping coordinates to the 0.05° grid
  npp_df <- as.data.frame(r_npp_res, xy = TRUE) |>
    filter(!is.na(npp_daily_avg)) |>
    mutate(
      cell_lon = round(x / 0.05) * 0.05,
      cell_lat = round(y / 0.05) * 0.05)

  # insert into env_layers
  # (similar process as species values)
}
```

## Export to Parquet

```{r export_parquet}
#| eval: false
# export tables to parquet for efficient storage
output_dir <- glue("{dir_data}/derived/sdm_parquet")
dir_create(output_dir)

# export each table
tables_to_export <- c(
  "dataset", "model", "species", "cell", "model_cell",
  "metric", "cell_metric", "region", "region_metric",
  "region_cell", "planning_areas")

for (tbl in tables_to_export) {
  message(glue("Exporting {tbl} to parquet..."))
  df <- dbGetQuery(con_sdm, glue("SELECT * FROM {tbl}"))
  arrow::write_parquet(
    df,
    file.path(output_dir, glue("{tbl}.parquet")),
    compression = "snappy")
}
```

## Close Connection

```{r cleanup}
dbDisconnect(con_sdm)
```