Ingest AquaMaps to SDM DuckDB

Transform downscaled AquaMaps data into DuckDB for efficient biodiversity analysis

Published

2025-05-27 20:28:46

1 Overview

This notebook ingests downscaled AquaMaps species distribution models (0.05° resolution) into a DuckDB database (msens_sdm) for efficient querying and biodiversity metric calculation. The database schema supports multiple SDM sources and includes spatial functions for raster generation and polygon summarization.

Code
# Load packages (quiet = TRUE, not the reassignable shorthand T)
librarian::shelf(
  DBI, dbplyr, dplyr, duckdb, fs, glue, here, leaflet, mapgl, purrr, 
  sf, terra, tibble, tidyr, quiet = TRUE)

source(here("libs/db.R"))
source(here("libs/am_functions.R"))
source(here("libs/sdm_functions.R"))

# set up directories: network share on the Linux server, Google Drive locally
is_server <- Sys.info()[["sysname"]] == "Linux"
dir_data <- ifelse(
  is_server,
  "/share/data",
  "~/My Drive/projects/msens/data")

# database setup
db_path <- glue("{dir_data}/derived/msens_sdm.duckdb")
con_sdm <- dbConnect(duckdb(), dbdir = db_path, read_only = FALSE)
# dbDisconnect(con_sdm, shutdown = T)
# duckdb_shutdown(duckdb())

# Check whether the DuckDB spatial extension is already loaded by probing a
# spatial function; returns TRUE if ST_Point resolves, FALSE otherwise.
check_spatial <- function(con) {
  tryCatch({
    dbGetQuery(con, "SELECT ST_Point(0, 0)")
    TRUE
  }, error = function(e) {
    FALSE
  })
}
# Only install/load if not already available (INSTALL is idempotent but slow)
if (!check_spatial(con_sdm)) {
  res <- dbExecute(con_sdm, "INSTALL spatial; LOAD spatial;")
}

2 Database Schema

The database schema supports multiple species per model and includes biodiversity metrics with flexible regional aggregation.

erDiagram
%% https://mermaid.js.org/syntax/entityRelationshipDiagram.html

%% tables
dataset {
  str ds_key          PK  "dataset key"
  str name_short          "short name: {source_broad} {taxa_groups} {response_type} in {regions}, {year_pub}"
  str name_original       "original dataset name"
  str description
  str citation
  str source_broad        "e.g. NOAA, AquaMaps, Duke, NCCOS"
  str source_detail       "e.g. NOAA Southeast Fisheries Science Center"
  str regions             "e.g. Global, Atlantic, Pacific, Gulf of America"
  str response_type       "occurrence, range, suitability, biomass, density, diversity"
  str taxa_groups         "fish, invertebrates, marine mammals, cetaceans, sea turtles, seabirds"
  int year_pub            "year published"
  date date_obs_beg       "first observation date"
  date date_obs_end       "last observation date"
  date date_env_beg       "first environmental data date"
  date date_env_end       "last environmental data date"
  str link_info           "primary information URL"
  str link_download       "download URL"
  str link_metadata       "metadata URL"
  str links_other         "other URLs space-separated"
  dbl spatial_res_deg     "spatial resolution in decimal degrees"
  str temporal_res        "temporal resolution"
  date date_created       "date added to database"
}

model {
  int mdl_seq         PK  "sequential model ID"
  str ds_key          FK  "dataset key"
  str taxa                "taxonomic identifier"
  str time_period         "time period"
  str region              "geographic region"
  str mdl_type            "occurrence, range, suitability, biomass, density, diversity"
  str description
  date date_created       "date added to database"
}

species {
  int sp_id           PK  "species ID"
  str ds_key          FK  "dataset key"
  str taxa            FK  "links to model.taxa"
  str sp_key              "species key"
  int aphia_id            "WoRMS Aphia ID"
  int gbif_id             "GBIF ID"
  int itis_id             "ITIS ID"
  str scientific_name
  str common_name
  str sp_cat              "fish, mollusk, crustacean, etc."
}

cell {
  int cell_id         PK  "cell ID"
  dbl lon                 "longitude"
  dbl lat                 "latitude"
  dbl area_km2            "cell area in square kilometers"
  geo geom                "cell geometry"
}

model_cell {
  int cell_id         FK  "cell ID"
  int mdl_seq         FK  "model sequential ID"
  dbl value               "model value"
}

metric {
  int metric_seq      PK  "auto-incrementing metric ID"
  str metric_type         "species_richness, shannon_index, etc."
  str description
  date date_created       "date metric was created"
}

cell_metric {
  int cell_id         FK  "cell ID"
  int metric_seq      FK  "metric ID"
  dbl value               "calculated metric value"
}

region {
  int rgn_seq         PK  "auto-incrementing region ID"
  str tbl_col             "{table}.{column} e.g. planning_areas.pa_key"
  str value               "unique identifier value e.g. CHU"
  str col_geom            "geometry column name"
  date date_created       "date added to database"
}

region_metric {
  int rgn_seq         FK  "region ID"
  int metric_seq      FK  "metric ID"
  dbl value               "calculated metric value"
}

region_cell {
  int rgn_seq         FK  "region ID"
  int cell_id         FK  "cell ID"
  int pct_covered         "percentage of cell covered by region"
}

planning_areas {
  str pa_key          PK  "planning area key"
  str pa_name
  geo geom                "planning area geometry"
}

%% relationships
dataset ||--o{ model : ds_key
dataset ||--o{ species : ds_key
model ||--|{ species : "ds_key,taxa"
model ||--o{ model_cell : mdl_seq
cell ||--o{ model_cell : cell_id
cell ||--o{ cell_metric : cell_id
cell ||--o{ region_cell : cell_id
metric ||--o{ cell_metric : metric_seq
metric ||--o{ region_metric : metric_seq
region ||--o{ region_metric : rgn_seq
region ||--o{ region_cell : rgn_seq

3 Create Database Tables

Code
# Tear down any prior schema (both current and legacy table names) before
# re-creating; the sentinel check leaves a fresh database untouched.
if (dbExistsTable(con_sdm, "model_cell") || dbExistsTable(con_sdm, "sdm_values")) {
  tables_to_drop <- c(
    "region_metric", "region_cell", "cell_metric", "env_layers", 
    "model_cell", "sdm_values", "species", "model", "dataset", 
    "cell", "sdm_cells", "metric", "region", "planning_areas",
    "biodiv_metrics", "planning_area_metrics", "sdm_species", 
    "sdm_models", "sdm_sources"
  )

  # drop only the tables that actually exist, in the order listed above
  # (children before parents so CASCADE has little to do)
  tables_to_drop |>
    purrr::keep(\(tbl) dbExistsTable(con_sdm, tbl)) |>
    purrr::walk(\(tbl) dbExecute(con_sdm, glue("DROP TABLE {tbl} CASCADE")))
}

# create tables: DDL collected in dependency order (parents before children
# so foreign-key references resolve), then executed one statement at a time
ddl <- c(

  # datasets: one row per SDM source (e.g. AquaMaps)
  "
CREATE TABLE dataset (
  ds_key VARCHAR PRIMARY KEY,
  name_short VARCHAR NOT NULL,
  name_original VARCHAR,
  description TEXT,
  citation TEXT,
  source_broad VARCHAR,
  source_detail VARCHAR,
  regions VARCHAR,
  response_type VARCHAR,
  taxa_groups VARCHAR,
  year_pub INTEGER,
  date_obs_beg DATE,
  date_obs_end DATE,
  date_env_beg DATE,
  date_env_end DATE,
  link_info VARCHAR,
  link_download VARCHAR,
  link_metadata VARCHAR,
  links_other VARCHAR,
  spatial_res_deg DOUBLE,
  temporal_res VARCHAR,
  date_created DATE DEFAULT CURRENT_DATE
)",

  # sequence backing model.mdl_seq
  "
CREATE SEQUENCE model_mdl_seq_seq START 1",

  # models: one row per taxa/time/region model within a dataset
  "
CREATE TABLE model (
  mdl_seq INTEGER PRIMARY KEY DEFAULT nextval('model_mdl_seq_seq'),
  ds_key VARCHAR REFERENCES dataset(ds_key),
  taxa VARCHAR,
  time_period VARCHAR,
  region VARCHAR,
  mdl_type VARCHAR,
  description TEXT,
  date_created DATE DEFAULT CURRENT_DATE
)",

  # species: taxonomic identifiers, linked to model via (ds_key, taxa)
  "
CREATE TABLE species (
  sp_id INTEGER PRIMARY KEY,
  ds_key VARCHAR REFERENCES dataset(ds_key),
  taxa VARCHAR,
  sp_key VARCHAR NOT NULL,
  aphia_id INTEGER,
  gbif_id INTEGER,
  itis_id INTEGER,
  scientific_name VARCHAR,
  common_name VARCHAR,
  sp_cat VARCHAR
)",

  # grid cells: 0.05-degree cells with point geometry
  "
CREATE TABLE cell (
  cell_id INTEGER PRIMARY KEY,
  lon DOUBLE NOT NULL,
  lat DOUBLE NOT NULL,
  area_km2 DOUBLE NOT NULL,
  geom GEOMETRY
)",

  # model values per cell
  "
CREATE TABLE model_cell (
  cell_id INTEGER REFERENCES cell(cell_id),
  mdl_seq INTEGER REFERENCES model(mdl_seq),
  value DOUBLE,
  PRIMARY KEY (cell_id, mdl_seq)
)",

  # sequence backing metric.metric_seq
  "
CREATE SEQUENCE metric_metric_seq_seq START 1",

  # biodiversity metric definitions
  "
CREATE TABLE metric (
  metric_seq INTEGER PRIMARY KEY DEFAULT nextval('metric_metric_seq_seq'),
  metric_type VARCHAR NOT NULL,
  description TEXT,
  date_created DATE DEFAULT CURRENT_DATE
)",

  # metric values per cell
  "
CREATE TABLE cell_metric (
  cell_id INTEGER REFERENCES cell(cell_id),
  metric_seq INTEGER REFERENCES metric(metric_seq),
  value DOUBLE,
  PRIMARY KEY (cell_id, metric_seq)
)",

  # sequence backing region.rgn_seq
  "
CREATE SEQUENCE region_rgn_seq_seq START 1",

  # regions: generic pointer to {table}.{column} = value for aggregation
  "
CREATE TABLE region (
  rgn_seq INTEGER PRIMARY KEY DEFAULT nextval('region_rgn_seq_seq'),
  tbl_col VARCHAR NOT NULL,
  value VARCHAR NOT NULL,
  col_geom VARCHAR NOT NULL,
  date_created DATE DEFAULT CURRENT_DATE
)",

  # metric values per region
  "
CREATE TABLE region_metric (
  rgn_seq INTEGER REFERENCES region(rgn_seq),
  metric_seq INTEGER REFERENCES metric(metric_seq),
  value DOUBLE,
  PRIMARY KEY (rgn_seq, metric_seq)
)",

  # cell-to-region membership with coverage percentage
  "
CREATE TABLE region_cell (
  rgn_seq INTEGER REFERENCES region(rgn_seq),
  cell_id INTEGER REFERENCES cell(cell_id),
  pct_covered INTEGER,
  PRIMARY KEY (rgn_seq, cell_id)
)",

  # BOEM planning area polygons
  "
CREATE TABLE planning_areas (
  pa_key VARCHAR PRIMARY KEY,
  pa_name VARCHAR,
  geom GEOMETRY
)")

purrr::walk(ddl, \(sql) dbExecute(con_sdm, sql))

# create indexes to speed the common joins and lookups
dbExecute(con_sdm, "CREATE INDEX idx_model_cell_cell_id ON model_cell(cell_id)")
dbExecute(con_sdm, "CREATE INDEX idx_model_cell_mdl_seq ON model_cell(mdl_seq)")
dbExecute(con_sdm, "CREATE INDEX idx_cell_coords ON cell(lon, lat)")
dbExecute(con_sdm, "CREATE INDEX idx_species_taxa ON species(ds_key, taxa)")
dbExecute(con_sdm, "CREATE INDEX idx_model_taxa ON model(ds_key, taxa)")
dbExecute(con_sdm, "CREATE INDEX idx_region_cell_rgn ON region_cell(rgn_seq)")
dbExecute(con_sdm, "CREATE INDEX idx_region_cell_cell ON region_cell(cell_id)")
# DuckDB's spatial extension creates R-tree indexes with USING RTREE;
# "CREATE SPATIAL INDEX" is SQL Server syntax and fails to parse in DuckDB
dbExecute(con_sdm, "CREATE INDEX idx_cell_geom ON cell USING RTREE (geom)")
dbExecute(con_sdm, "CREATE INDEX idx_planning_areas_geom ON planning_areas USING RTREE (geom)")

4 Ingest AquaMaps Source

Code
# Register AquaMaps as a dataset: a single row keyed by ds_key = "am_0.05".
# Observation and environmental-data date windows are named up front.
obs_dates <- as.Date(c("1990-01-01", "2020-12-31"))
env_dates <- as.Date(c("2000-01-01", "2020-12-31"))

aquamaps_dataset <- tibble(
  ds_key = "am_0.05",
  name_short = "AquaMaps fish suitability in Global, 2023",
  name_original = "AquaMaps: Predicted range maps for aquatic species",
  description = "AquaMaps species distribution models downscaled to 0.05° using Bio-Oracle v3 environmental layers",
  citation = "Kaschner et al. (2023) AquaMaps: Predicted range maps for aquatic species. Version 10/2019. www.aquamaps.org",
  source_broad = "AquaMaps",
  source_detail = "AquaMaps.org collaborative project",
  regions = "Global",
  response_type = "suitability",
  taxa_groups = "fish",
  year_pub = 2023,
  date_obs_beg = obs_dates[1],
  date_obs_end = obs_dates[2],
  date_env_beg = env_dates[1],
  date_env_end = env_dates[2],
  link_info = "https://www.aquamaps.org",
  link_download = "https://www.aquamaps.org/download/main.php",
  link_metadata = "https://www.aquamaps.org/main/alg_2019.php",
  links_other = "https://www.fishbase.org https://www.sealifebase.org",
  spatial_res_deg = 0.05,
  temporal_res = "static")

dbWriteTable(con_sdm, "dataset", aquamaps_dataset, append = TRUE)

5 Load Planning Areas

Code
# read planning areas from the existing source database (connection `con`,
# opened in libs/db.R)
ply_pa <- st_read(con, "ply_planareas_2025")

# keep key/name and project to WGS84 to match the 0.05° grid
planning_areas <- ply_pa |>
  select(pa_key = planarea_key, pa_name = planarea_name, geom) |> # TODO: keep original
  st_transform(4326)

# convert geometry to WKT so DuckDB spatial can parse it
pa_df <- planning_areas |>
  mutate(geom_wkt = st_as_text(geom)) |>
  st_drop_geometry()

# replace prior rows, then insert all areas in one parameterized statement;
# bound parameters avoid SQL quoting/injection problems (e.g. names
# containing apostrophes) that the previous glue-interpolated loop had
dbExecute(con_sdm, "DELETE FROM planning_areas")
dbExecute(
  con_sdm,
  "INSERT INTO planning_areas (pa_key, pa_name, geom)
   VALUES (?, ?, ST_GeomFromText(?))",
  params = list(pa_df$pa_key, pa_df$pa_name, pa_df$geom_wkt))

6 Create 0.05° Grid Cells

Code
# create global 0.05 degree raster template
res <- 0.05
r_global <- terra::rast(
  xmin = -180, xmax = 180,
  ymin = -90,  ymax = 90,
  resolution = res,
  crs = "EPSG:4326"
)

# get cell IDs and center coordinates using terra
cell_ids <- terra::cells(r_global)
coords <- terra::xyFromCell(r_global, cell_ids)

# calculate cell areas in km2; for 0.05 degree cells, area varies by latitude
cell_areas <- terra::cellSize(r_global, unit = "km")
areas_km2 <- terra::values(cell_areas)[cell_ids]

# create grid cells dataframe
grid_cells <- tibble(
  cell_id  = cell_ids,
  lon      = coords[, 1],
  lat      = coords[, 2],
  area_km2 = areas_km2
)

message(glue("Inserting {nrow(grid_cells)} grid cells..."))

# Bulk load: register the data frame as a DuckDB view and insert in a single
# set-based statement, building point geometries with ST_Point (equivalent
# to the previous WKT 'POINT(lon lat)'). This replaces millions of
# individually glue'd INSERT statements, which were pathologically slow.
duckdb::duckdb_register(con_sdm, "grid_cells_tmp", grid_cells)
dbExecute(con_sdm, "
  INSERT INTO cell (cell_id, lon, lat, area_km2, geom)
  SELECT cell_id, lon, lat, area_km2, ST_Point(lon, lat)
  FROM grid_cells_tmp")
duckdb::duckdb_unregister(con_sdm, "grid_cells_tmp")

# save the global raster template for later use
saveRDS(r_global, glue("{dir_data}/derived/r_global_0.05.rds"))

7 Process AquaMaps Downscaled Models

Code
# this chunk assumes the iterate_species_in_planareas chunk from 
# ingest_aquamaps_res05.qmd has been run and produced species rasters

# get list of species rasters
# fixed: use dir_tifs (species_dir was undefined) and anchor the regex on
# the .tif extension ("*.tif$" is a glob, not a valid regex anchor)
dir_tifs <- glue("{dir_data}/derived/aquamaps.org/spp_cells0.05_cog")
spp_tifs <- list.files(dir_tifs, pattern = "\\.tif$", full.names = TRUE)

# load global raster template
r_global <- readRDS(glue("{dir_data}/derived/r_global_0.05.rds"))

# process each species
sp_id_counter <- 1

for (sp_tif in spp_tifs[1:10]) { # process first 10 for testing
  
  # fixed: derive the species key from the loop variable (sp_file was undefined)
  sp_key <- basename(sp_tif) |> path_ext_remove()
  message(glue("Processing species: {sp_key}"))
  
  # read species raster
  r_sp <- rast(sp_tif)
  
  # get species info from original database (connection `con`)
  sp_info <- tbl(con, "spp") |>
    filter(sp_key == !!sp_key) |>
    collect()

  # create model for this species (one model per species for AquaMaps)
  model_info <- tibble(
    ds_key      = "am_0.05",
    taxa        = sp_key,  # for AquaMaps, taxa equals sp_key
    time_period = "2019",
    region      = "Global",
    mdl_type    = "suitability",
    description = glue(
      "AquaMaps suitability for {sp_info$genus[1]} {sp_info$species[1]}, interpolated to 0.05° resolution") )
  
  dbWriteTable(con_sdm, "model", model_info, append = TRUE)
  
  # get the mdl_seq that was just created
  mdl_seq <- dbGetQuery(con_sdm, glue("
    SELECT mdl_seq FROM model 
    WHERE ds_key = '{model_info$ds_key}' 
      AND taxa = '{model_info$taxa}'
    ORDER BY mdl_seq DESC LIMIT 1 "))$mdl_seq
  
  # determine species category from taxonomy
  sp_cat <- case_when(
    sp_info$class[1] == "Actinopterygii" ~ "fish",
    sp_info$class[1] == "Chondrichthyes" ~ "fish",
    sp_info$class[1] == "Mammalia" ~ "marine mammal",
    sp_info$class[1] == "Reptilia" ~ "sea turtle",
    sp_info$phylum[1] == "Mollusca" ~ "mollusk",
    sp_info$phylum[1] == "Arthropoda" && sp_info$subphylum[1] == "Crustacea" ~ "crustacean",
    TRUE ~ "other"
  )
  
  # add species to database
  species_info <- tibble(
    sp_id           = sp_id_counter,
    ds_key          = "am_0.05",
    taxa            = sp_key,
    sp_key          = sp_key,
    aphia_id        = sp_info$aphia_id[1],
    gbif_id         = NA_integer_,  # TODO: lookup from GBIF
    itis_id         = NA_integer_,  # TODO: lookup from ITIS
    scientific_name = glue("{sp_info$genus[1]} {sp_info$species[1]}"),
    common_name     = sp_info$common_name[1],
    sp_cat          = sp_cat)
  
  dbWriteTable(con_sdm, "species", species_info, append = TRUE)
  
  # extract non-NA cell values; which() on the full-length values vector
  # yields cell numbers aligned with the values themselves. (The previous
  # mix of terra::cells() and a full-length NA mask could produce
  # mismatched cell_id/value vectors.)
  v <- terra::values(r_sp, mat = FALSE)
  keep <- which(!is.na(v))
  
  # create model_cell records
  model_cell_df <- tibble(
    cell_id = keep,
    mdl_seq = mdl_seq,
    value   = v[keep] / 100  # convert from percentage to probability
  )
  
  # batch insert values (helper from libs/sdm_functions.R)
  batch_insert_values(con_sdm, model_cell_df, "model_cell")
  
  sp_id_counter <- sp_id_counter + 1
}

8 Calculate Biodiversity Metrics

Code
# first create metric definitions
metrics_to_create <- tibble::tribble(
  ~metric_type,         ~description,
  "species_richness",   "Number of species with suitability > 0.5",
  "shannon_index",      "Shannon diversity index based on suitability values",
  "simpson_index",      "Simpson diversity index based on suitability values"
)

# one parameterized statement inserts all rows; bound parameters avoid
# quoting problems that glue interpolation would hit on descriptions
# containing apostrophes
dbExecute(
  con_sdm,
  "INSERT INTO metric (metric_type, description) VALUES (?, ?)",
  params = list(metrics_to_create$metric_type, metrics_to_create$description))

# get metric IDs
metric_ids <- dbGetQuery(con_sdm, "SELECT metric_seq, metric_type FROM metric")

# calculate species richness per cell: count of species whose suitability
# exceeds the 0.5 presence threshold
message("Calculating species richness...")

richness_metric_seq <- metric_ids$metric_seq[metric_ids$metric_type == "species_richness"]

dbExecute(con_sdm, glue("
INSERT INTO cell_metric (cell_id, metric_seq, value)
SELECT 
  mc.cell_id,
  {richness_metric_seq} as metric_seq,
  COUNT(DISTINCT m.taxa) as value
FROM model_cell mc
JOIN model m ON mc.mdl_seq = m.mdl_seq
WHERE mc.value > 0.5  -- presence threshold
GROUP BY mc.cell_id
"))

# calculate Shannon diversity index: H = -sum(p * ln(p)) where p is each
# species' suitability as a proportion of the cell's total suitability
message("Calculating Shannon diversity...")

shannon_metric_seq <- metric_ids$metric_seq[metric_ids$metric_type == "shannon_index"]

dbExecute(con_sdm, glue("
INSERT INTO cell_metric (cell_id, metric_seq, value)
SELECT 
  cell_id,
  {shannon_metric_seq} as metric_seq,
  -SUM(p * LN(p)) as value
FROM (
  SELECT 
    mc.cell_id,
    mc.value / SUM(mc.value) OVER (PARTITION BY mc.cell_id) as p
  FROM model_cell mc
  WHERE mc.value > 0
) t
WHERE p > 0
GROUP BY cell_id
"))

9 Generate Biodiversity Raster

Code
# load the saved 0.05° global raster template
r_global <- readRDS(glue("{dir_data}/derived/r_global_0.05.rds"))

# look up the metric ID for species richness
richness_metric_seq <- dbGetQuery(con_sdm, "
  SELECT metric_seq FROM metric WHERE metric_type = 'species_richness'
")$metric_seq

# per-cell richness values, restricted to a test region (NE Atlantic /
# North America bounding box)
sql_richness <- glue("
SELECT 
  cm.cell_id,
  cm.value as species_richness
FROM cell_metric cm
WHERE cm.metric_seq = {richness_metric_seq}
  AND cm.cell_id IN (
    SELECT cell_id FROM cell 
    WHERE lon BETWEEN -180 AND -60  -- limit to a region for testing
      AND lat BETWEEN 20 AND 50
  )
")
richness_df <- dbGetQuery(con_sdm, sql_richness)

# rasterize onto the global template (helper from libs/sdm_functions.R)
r_richness <- create_raster_from_df(
  richness_df,
  r_template = r_global,
  value_cols = "species_richness")

# display a raster with mapgl
# NOTE(review): the SpatRaster is passed directly as `source`; confirm
# against the mapgl package API that this source type is accepted
mapgl_rast <- function(r, palette = "viridis", layer_name = "SDM") {
  # (removed an unused as.data.frame(r) conversion that was computed but
  # never referenced)
  mapgl() |>
    add_raster_tile_layer(
      id = "sdm_raster",
      source = r,
      source_layer = layer_name,
      color_palette = palette
    )
}

# visualize
mapgl_rast(r_richness, palette = "YlOrRd", layer_name = "Species Richness")

10 Summarize by Planning Area

Code
# first, register every planning area as a region; one parameterized
# statement replaces the per-row glue'd loop (safe against quoting issues)
planning_area_regions <- dbGetQuery(con_sdm, "
  SELECT pa_key, pa_name FROM planning_areas
")

dbExecute(
  con_sdm,
  "INSERT INTO region (tbl_col, value, col_geom)
   VALUES ('planning_areas.pa_key', ?, 'geom')",
  params = list(planning_area_regions$pa_key))

# get region and metric IDs
region_ids <- dbGetQuery(con_sdm, "
  SELECT rgn_seq, value FROM region WHERE tbl_col = 'planning_areas.pa_key'
")

richness_metric_seq <- dbGetQuery(con_sdm, "
  SELECT metric_seq FROM metric WHERE metric_type = 'species_richness'
")$metric_seq

# calculate average species richness per planning area
message("Calculating planning area metrics...")

for (i in seq_len(nrow(region_ids))) {
  pa_key  <- region_ids$value[i]
  rgn_seq <- region_ids$rgn_seq[i]
  
  # spatial join: average the metric over cells whose point geometry falls
  # within the planning area polygon (bound parameters, not glue)
  result <- dbGetQuery(
    con_sdm,
    "SELECT AVG(cm.value) as avg_value
     FROM planning_areas p
     JOIN cell c ON ST_Within(c.geom, p.geom)
     JOIN cell_metric cm ON c.cell_id = cm.cell_id
     WHERE p.pa_key = ?
       AND cm.metric_seq = ?",
    params = list(pa_key, richness_metric_seq))
  
  # skip planning areas with no overlapping cells (AVG returns NA)
  if (!is.na(result$avg_value[1])) {
    dbExecute(
      con_sdm,
      "INSERT INTO region_metric (rgn_seq, metric_seq, value)
       VALUES (?, ?, ?)",
      params = list(rgn_seq, richness_metric_seq, result$avg_value[1]))
  }
}

# query results, highest average richness first
pa_metrics <- dbGetQuery(con_sdm, "
SELECT 
  p.pa_key,
  p.pa_name,
  rm.value as avg_richness
FROM region r
JOIN region_metric rm ON r.rgn_seq = rm.rgn_seq
JOIN planning_areas p ON r.value = p.pa_key
WHERE r.tbl_col = 'planning_areas.pa_key'
  AND rm.metric_seq = (SELECT metric_seq FROM metric WHERE metric_type = 'species_richness')
ORDER BY rm.value DESC
")

print(pa_metrics)

11 Add Environmental Layers

Code
# example: add primary productivity from ingest_productivity.qmd
# NOTE(review): this chunk is an unfinished sketch — the final insert into
# env_layers is not implemented, and env_layers is only referenced in the
# drop list above, not created by this notebook's DDL. Confirm where that
# table is defined before wiring this up.
npp_tif <- here("data/derived/vgpm-viirs_2023-daily-avg.tif")

# guard: the productivity raster is produced by another notebook and may
# not exist yet on this machine
if (file.exists(npp_tif)) {
  r_npp <- rast(npp_tif)
  
  # resample to 0.05 degree resolution
  r_template <- rast(
    extent = ext(-180, 180, -90, 90),
    resolution = 0.05,
    crs = "EPSG:4326"
  )
  
  r_npp_res <- resample(r_npp, r_template, method = "bilinear")
  
  # extract values; assumes the tif's layer is named npp_daily_avg —
  # TODO confirm against the output of ingest_productivity.qmd
  npp_df <- as.data.frame(r_npp_res, xy = TRUE) |>
    filter(!is.na(npp_daily_avg)) |>
    mutate(
      # snap x/y coordinates to the 0.05° grid centers
      cell_lon = round(x / 0.05) * 0.05,
      cell_lat = round(y / 0.05) * 0.05
    )
  
  # insert into env_layers
  # (similar process as species values)
}

12 Export to Parquet

Code
# export tables to parquet for efficient storage
output_dir <- glue("{dir_data}/derived/sdm_parquet")
dir_create(output_dir)

# export each table
tables_to_export <- c(
  "dataset", "model", "species", 
  "cell", "model_cell", "metric",
  "cell_metric", "region", "region_metric", "region_cell",
  "planning_areas"
)

# Use DuckDB's native COPY so large tables (cell, model_cell) stream
# straight to parquet without being materialized in R memory. This also
# removes the implicit dependency on the arrow package, which is not
# loaded by the librarian::shelf() call at the top of this notebook.
for (tbl in tables_to_export) {
  message(glue("Exporting {tbl} to parquet..."))
  
  pq_path <- file.path(output_dir, glue("{tbl}.parquet"))
  dbExecute(con_sdm, glue(
    "COPY {tbl} TO '{pq_path}' (FORMAT PARQUET, COMPRESSION SNAPPY)"))
}

13 Close Connection

Code
dbDisconnect(con_sdm)