Investigate Rice’s Whale DB Corruption

Diagnose spatial data corruption in model_cell for Rice’s whale and other species

Published

2026-02-14 19:34:24

1 Overview

Rice’s whale (taxon_id=1576133) rng_iucn shows only ~1,969 cells (identical to ca_nmfs), but the correct IUCN polygon covers ~49,500 km² and should produce ~3,000-4,000 cells. The Critical Habitat (ch_nmfs) has 2,677 cells — more than the supposedly-larger IUCN range, which is backwards. This notebook diagnoses when/how the corruption occurred by comparing database backups and raw source files.

2 Setup

Code
librarian::shelf(
  DBI,
  dplyr,
  duckdb,
  glue,
  here,
  leaflet,
  purrr,
  sf,
  terra,
  tibble,
  quiet = T
)
source(here("libs/paths.R"))

cell_tif <- glue("{dir_derived}/r_bio-oracle_planarea.tif")
r_cell <- rast(cell_tif, lyrs = "cell_id")

3 Connect to Database Versions

Code
dir_archive <- glue("{dir_big}/archive_2026-02-12")

# define database paths (read-only)
db_paths <- c(
  "Jan 29" = glue("{dir_archive}/sdm_2026-01-29.duckdb"),
  "Feb 10" = glue("{dir_archive}/sdm_2026-02-10.duckdb"),
  "Feb 12 (v3 backup)" = glue("{dir_archive}/sdm_v3_2026-02-12.duckdb"),
  "v3 current" = glue("{dir_big}/v3/sdm.duckdb"),
  "v3b" = glue("{dir_big}/v3b/sdm.duckdb")
)

# filter to paths that actually exist
db_paths <- db_paths[file.exists(db_paths)]
message(glue(
  "Found {length(db_paths)} databases: {paste(names(db_paths), collapse = ', ')}"
))
Found 5 databases: Jan 29, Feb 10, Feb 12 (v3 backup), v3 current, v3b
Code
# open read-only connections
cons <- map(db_paths, \(p) dbConnect(duckdb(dbdir = p, read_only = TRUE)))
names(cons) <- names(db_paths)

4 Rice’s Whale model_cell Comparison Across DBs

Code
taxon_id_ricei <- 1576133L

d_compare <- map_dfr(names(cons), \(db_name) {
  con <- cons[[db_name]]

  # check if taxon_model table exists
  if (!dbExistsTable(con, "taxon_model")) {
    return(tibble())
  }

  tryCatch(
    {
      tbl(con, "taxon_model") |>
        filter(taxon_id == !!taxon_id_ricei) |>
        select(taxon_id, mdl_seq) |>
        inner_join(
          tbl(con, "model") |> select(mdl_seq, ds_key),
          by = "mdl_seq"
        ) |>
        inner_join(
          tbl(con, "model_cell") |>
            group_by(mdl_seq) |>
            summarize(
              n_cells = n(),
              v_min = min(value, na.rm = TRUE),
              v_max = max(value, na.rm = TRUE),
              .groups = "drop"
            ),
          by = "mdl_seq"
        ) |>
        collect() |>
        mutate(db_version = db_name)
    },
    error = \(e) {
      message(glue("Error querying {db_name}: {e$message}"))
      tibble()
    }
  )
})

if (nrow(d_compare) > 0) {
  d_compare |>
    select(db_version, ds_key, mdl_seq, n_cells, v_min, v_max) |>
    arrange(db_version, ds_key) |>
    knitr::kable(caption = "Rice's whale model_cell summary across DB versions")
} else {
  message("No data found for Rice's whale in any database")
}
Rice’s whale model_cell summary across DB versions
db_version ds_key mdl_seq n_cells v_min v_max
Feb 10 ca_nmfs 23004 1969 100 100
Feb 10 ch_nmfs 18232 2677 100 100
Feb 10 ms_merge 25416 3617 100 100
Feb 10 rng_iucn 23505 1969 50 50
Feb 10 rng_iucn 23505 1969 50 50
Feb 12 (v3 backup) ca_nmfs 23004 1969 100 100
Feb 12 (v3 backup) ch_nmfs 18232 2677 100 100
Feb 12 (v3 backup) ms_merge 25416 3617 100 100
Feb 12 (v3 backup) rng_iucn 23505 1969 50 50
v3 current ca_nmfs 23004 1969 100 100
v3 current ch_nmfs 18232 2677 100 100
v3 current ms_merge 25416 3617 100 100
v3 current rng_iucn 23505 1969 50 50
v3b ca_nmfs 23004 1969 100 100
v3b ch_nmfs 18232 2677 100 100
v3b ms_endemism 75452 1676 43 43
v3b ms_er 95087 3617 100 100
v3b ms_merge 104903 1676 43 43
v3b ms_suit 85268 1676 43 43
v3b rng_iucn 23505 1969 50 50

5 Visualize Rice’s Whale Spatial Layers

Map cell_id values back to raster for each layer from the current v3 database.

Code
con_v3 <- cons[["v3 current"]]

# get all Rice's whale layers
d_layers <- tbl(con_v3, "taxon_model") |>
  filter(taxon_id == !!taxon_id_ricei) |>
  select(taxon_id, mdl_seq) |>
  inner_join(
    tbl(con_v3, "model") |> select(mdl_seq, ds_key),
    by = "mdl_seq"
  ) |>
  collect()

# rasterize each layer
r_layers <- map(1:nrow(d_layers), \(i) {
  lyr <- d_layers[i, ]
  d_mc <- tbl(con_v3, "model_cell") |>
    filter(mdl_seq == !!lyr$mdl_seq) |>
    collect()

  r <- init(r_cell[[1]], NA)
  r[d_mc$cell_id] <- d_mc$value
  names(r) <- lyr$ds_key
  r
})
names(r_layers) <- d_layers$ds_key

# display each layer
for (nm in names(r_layers)) {
  r <- trim(r_layers[[nm]])
  message(glue("{nm}: {ncell(r[!is.na(r)])} non-NA cells"))
  plot(r, main = glue("Rice's whale — {nm}"))
}
ch_nmfs: 2677 non-NA cells

ca_nmfs: 1969 non-NA cells

rng_iucn: 1969 non-NA cells

ms_merge: 3617 non-NA cells

5.1 Interactive map comparison

Code
# combine all layers into a single stack for comparison
r_all <- map(r_layers, \(r) trim(r)) |>
  purrr::reduce(\(a, b) {
    # extend to common extent
    e <- ext(
      min(ext(a)[1], ext(b)[1]),
      max(ext(a)[2], ext(b)[2]),
      min(ext(a)[3], ext(b)[3]),
      max(ext(a)[4], ext(b)[4])
    )
    extend(a, e) |>
      c(extend(b, e))
  })

# plot the first available layer interactively
if (length(r_layers) > 0) {
  r_plot <- trim(r_layers[[1]])
  plet(r_plot, main = names(r_layers)[1])
}

6 Compare DB Cells to Raw Source Polygons

6.1 Load raw shapefiles

Code
# iucn range polygon
iucn_shp <- glue(
  "{dir_raw}/iucnredlist.org/MAMMALS_MARINE_ONLY/MAMMALS_MARINE_ONLY.shp"
)
if (file.exists(iucn_shp)) {
  sf_iucn <- st_read(iucn_shp, quiet = TRUE) |>
    filter(sci_name == "Balaenoptera ricei")
  message(glue(
    "IUCN polygon: {nrow(sf_iucn)} features, ",
    "area = {round(sum(st_area(sf_iucn)) / 1e6)} km²"
  ))
} else {
  message(glue("IUCN shapefile not found: {iucn_shp}"))
  sf_iucn <- NULL
}
IUCN polygon: 1 features, area = 49485 km²
Code
# nmfs core area polygon
core_dir <- glue(
  "{dir_raw}/fisheries.noaa.gov/core-areas/",
  "shapefile_Rices_whale_core_distribution_area_Jun19_SERO"
)
core_shp <- glue(
  "{core_dir}/",
  "shapefile_Rices_whale_core_distribution_area_Jun19_SERO.shp"
)
if (file.exists(core_shp)) {
  sf_core <- st_read(core_shp, quiet = TRUE)
  message(glue(
    "NMFS Core Area: {nrow(sf_core)} features, ",
    "area = {round(sum(st_area(sf_core)) / 1e6)} km²"
  ))
} else {
  message(glue("Core area shapefile not found: {core_shp}"))
  sf_core <- NULL
}
NMFS Core Area: 1 features, area = 49485 km²
Code
# critical habitat from gdb
ch_gdb <- glue(
  "{dir_raw}/fisheries.noaa.gov/",
  "NMFS_ESA_Critical_Habitat_20230505_2025-07-17.gdb"
)
if (file.exists(ch_gdb)) {
  # list layers to find Rice's whale critical habitat
  lyrs <- st_layers(ch_gdb)
  message(glue("GDB layers: {paste(lyrs$name, collapse = ', ')}"))
  # try to find the layer containing Rice's whale
  # (user may need to adjust layer name)
  ricei_lyr <- lyrs$name[grepl(
    "Rice|Bryde|Balaenoptera",
    lyrs$name,
    ignore.case = TRUE
  )]
  if (length(ricei_lyr) > 0) {
    sf_ch <- st_read(ch_gdb, layer = ricei_lyr[1], quiet = TRUE)
    message(glue("CH layer '{ricei_lyr[1]}': {nrow(sf_ch)} features"))
  } else {
    message("No Rice's whale layer found in GDB — listing all layers above")
    sf_ch <- NULL
  }
} else {
  message(glue("Critical Habitat GDB not found: {ch_gdb}"))
  sf_ch <- NULL
}
GDB layers: AbaloneBlack_20111027, Bocaccio_PugetSoundGeorgiaBasinDPS_20141113, CoralElkhorn_20081126, CoralStaghorn_20081126, RockfishYelloweye_PugetSoundGeorgiaBasinDPS_20141113, SalmonAtlantic_GulfofMaineDPS_20090619, SalmonChinook_PugetSoundESU_20050902_poly, SalmonChinook_SacramentoRiverwinterrunESU_19930616_poly, SalmonChum_HoodCanalsummerrunESU_20050902_poly, SalmonSockeye_SnakeRiverESU_19931228_poly, SawfishSmalltooth_USDPS_20090902, SealHawaiianMonk_20150821_poly, SeaLionSteller_WesternDPS_19940615, SeaTurtleGreen_NorthAtlanticDPS_19980902, SeaTurtleHawksbill_19980902, SeaTurtleLeatherback_20120126, SeaTurtleLoggerhead_NorthwestAtlanticOceanDPS_20140710, SturgeonAtlantic_GulfSubspecies_20030319_poly, SturgeonGreen_SouthernDPS_20091009_poly, WhaleBeluga_CookInletDPS_20110411, WhaleFalseKiller_MainHawaiianIslandsInsularDPS_20180724, WhaleHumpback_CentralAmericaDPS_20210421, WhaleHumpback_MexicoDPS_20210421, WhaleHumpback_WesternNorthPacificDPS_20210421, WhaleKiller_SouthernResidentDPS_20210802, WhaleNorthAtlanticRight_20160127, WhaleNorthPacificRight_20080408, Eulachon_SouthernDPS_20111020, SalmonChinook_CaliforniaCoastalESU_20050902, SalmonChinook_CentralValleyspringrunESU_20050902, SalmonChinook_LowerColumbiaRiverESU_20050902, SalmonChinook_PugetSoundESU_20050902_line, SalmonChinook_SacramentoRiverwinterrunESU_19930616_line, SalmonChinook_SnakeRiverfallrunESU_19931228, SalmonChinook_UpperColumbiaRiverspringrunESU_20050902, SalmonChinook_UpperWillametteRiverESU_20050902, SalmonChum_ColumbiaRiverESU_20050902, SalmonChum_HoodCanalsummerrunESU_20050902_line, SalmonCoho_CentralCaliforniaCoastESU_19990505, SalmonCoho_LowerColumbiaRiverESU_20160224, SalmonCoho_OregonCoastESU_20080211, SalmonSockeye_OzetteLakeESU_20050902, SalmonSockeye_SnakeRiverESU_19931228_line, SealHawaiianMonk_20150821_line, Steelhead_CaliforniaCentralValleyDPS_20050902, Steelhead_CentralCaliforniaCoastDPS_20050902, Steelhead_LowerColumbiaRiverDPS_20050902, Steelhead_MiddleColumbiaRiverDPS_20050902, Steelhead_NorthernCaliforniaDPS_20050902, Steelhead_PugetSoundDPS_20160224, Steelhead_SnakeRiverBasinDPS_20050902, Steelhead_SouthCentralCaliforniaCoastDPS_20050902, Steelhead_SouthernCaliforniaDPS_20050902, Steelhead_UpperColumbiaRiverDPS_20050902, Steelhead_UpperWillametteRiverDPS_20050902, SturgeonAtlantic_AtlanticSubspecies_CarolinaDPS_20170817, SturgeonAtlantic_AtlanticSubspecies_ChesapeakeBayDPS_20170817, SturgeonAtlantic_AtlanticSubspecies_GulfofMaineDPS_20170817, SturgeonAtlantic_AtlanticSubspecies_NewYorkBightDPS_20170817, SturgeonAtlantic_AtlanticSubspecies_SouthAtlanticDPS_20170817, SturgeonAtlantic_GulfSubspecies_20030319_line, SturgeonGreen_SouthernDPS_20091009_line, SealBearded_BeringiaDPS_20220401, SealRinged_ArcticSubspecies_20220401, All_critical_habitat_line_20220404, All_critical_habitat_poly_20230724, Proposed_SeaTurtleGreen_NorthAtlanticDPS_20230719, Proposed_SeaTurtleGreen_SouthAtlanticDPS_20230719, Proposed_SeaTurtleGreen_EastPacificDPS_20230719, Proposed_SeaTurtleGreen_CentralNorthPacificDPS_20230719, Proposed_SeaTurtleGreen_CentralSouthPacificDPS_20230719, Proposed_SeaTurtleGreen_CentralWestPacificDPS_20230719, Proposed_RicesWhale_20230724, CoralBoulderStar_20230809, CoralLobedStar_20230809, CoralMountainousStar_20230809, CoralPillar_20230809, CoralRoughCactus_20230809, GrouperNassau_20240102, Coral_AcroporaGlobiceps_20250715, Coral_AcroporaRetusa_20250715, Coral_AcroporaSpeciosa_20250715, Coral_FimbriaphylliaParadivisa_20250715, Coral_IsoporaCrateriformis_20250715
CH layer 'Proposed_RicesWhale_20230724': 1 features

6.2 Overlay raw polygons on rasterized cells

Code
# rasterize IUCN polygon using the same grid as the DB
# note: r_cell uses 0-360 longitude; polygon uses -180-180, so shift by +360
if (!is.null(sf_iucn)) {
  v_iucn <- vect(sf_iucn) |> shift(dx = 360)

  r_iucn_poly <- rasterize(v_iucn, r_cell, field = 1, background = NA)
  n_expected  <- sum(!is.na(values(r_iucn_poly)))
  message(glue(
    "Expected rng_iucn cells from polygon rasterization: {n_expected}"
  ))

  # compare to db count
  if ("rng_iucn" %in% names(r_layers)) {
    n_db <- sum(!is.na(values(r_layers[["rng_iucn"]])))
    message(glue(
      "DB rng_iucn cells: {n_db} vs polygon rasterization: {n_expected}"
    ))
    message(glue(
      "Ratio: {round(n_db / n_expected * 100, 1)}%",
      " — {ifelse(abs(n_db - n_expected) < 100, 'OK', 'CORRUPTION DETECTED')}"
    ))
  }

  # plot overlay
  if (n_expected > 0) {
    plot(trim(r_iucn_poly), main = "IUCN polygon rasterized (expected)")
  } else {
    message("No cells from polygon rasterization — CRS/extent issue")
  }
  if ("rng_iucn" %in% names(r_layers)) {
    plot(trim(r_layers[["rng_iucn"]]), main = "rng_iucn from DB (actual)")
  }
}
Expected rng_iucn cells from polygon rasterization: 1828
DB rng_iucn cells: 1969 vs polygon rasterization: 1828
Ratio: 107.7% — CORRUPTION DETECTED

7 Check for Broader Corruption

Query ALL species with rng_iucn data and check for suspicious patterns.

Code
# use current v3 database
con_check <- cons[["v3 current"]]

# check for species with identical cell sets across datasets
d_dup_cells <- dbGetQuery(
  con_check,
  "
  WITH ds_cells AS (
    SELECT tm.taxon_id, m.ds_key, m.mdl_seq, COUNT(mc.cell_id) as n_cells
    FROM taxon_model tm
    JOIN model m ON tm.mdl_seq = m.mdl_seq
    JOIN model_cell mc ON m.mdl_seq = mc.mdl_seq
    WHERE m.ds_key IN ('rng_iucn', 'ca_nmfs', 'ch_nmfs', 'ch_fws', 'rng_fws')
    GROUP BY tm.taxon_id, m.ds_key, m.mdl_seq
  )
  SELECT a.taxon_id, a.ds_key as ds_a, b.ds_key as ds_b,
         a.n_cells, a.mdl_seq as mdl_a, b.mdl_seq as mdl_b
  FROM ds_cells a
  JOIN ds_cells b ON a.taxon_id = b.taxon_id
    AND a.ds_key < b.ds_key
    AND a.n_cells = b.n_cells
  ORDER BY a.taxon_id, a.ds_key"
)

if (nrow(d_dup_cells) > 0) {
  message(glue(
    "{nrow(d_dup_cells)} taxon-dataset pairs with identical cell counts"
  ))
  d_dup_cells |>
    knitr::kable(
      caption = "Species with identical cell counts across datasets (potential corruption)"
    )
} else {
  message("No species found with identical cell counts across datasets")
}
1 taxon-dataset pairs with identical cell counts
Species with identical cell counts across datasets (potential corruption)
taxon_id ds_a ds_b n_cells mdl_a mdl_b
1576133 ca_nmfs rng_iucn 1969 23004 23505

7.1 Check rng_iucn cell counts for all species

Code
d_rng <- dbGetQuery(
  con_check,
  "
  SELECT
    tm.taxon_id,
    t.scientific_name,
    t.common_name,
    t.sp_cat,
    m.mdl_seq,
    COUNT(mc.cell_id) as n_cells,
    MIN(mc.value) as v_min,
    MAX(mc.value) as v_max
  FROM taxon_model tm
  JOIN model m ON tm.mdl_seq = m.mdl_seq
  JOIN model_cell mc ON m.mdl_seq = mc.mdl_seq
  LEFT JOIN taxon t ON tm.taxon_id = t.taxon_id
  WHERE m.ds_key = 'rng_iucn'
  GROUP BY tm.taxon_id, t.scientific_name, t.common_name, t.sp_cat, m.mdl_seq
  ORDER BY n_cells ASC"
)

message(glue(
  "Total species with rng_iucn: {nrow(d_rng)}, ",
  "cell count range: {min(d_rng$n_cells)} - {max(d_rng$n_cells)}"
))
Total species with rng_iucn: 1515, cell count range: 6 - 562423
Code
# show species with suspiciously few cells (bottom 20)
d_rng |>
  head(20) |>
  knitr::kable(
    caption = "Species with fewest rng_iucn cells (potential corruption)"
  )
Species with fewest rng_iucn cells (potential corruption)
taxon_id scientific_name common_name sp_cat mdl_seq n_cells v_min v_max
105901 Centrophorus squamosus leafscale gulper shark fish 19623 6 25 25
105827 Hydrolagus pallidus pale ghost shark fish 19680 6 1 1
271304 Myxine hubbsi NA fish 23444 11 1 1
220030 Nebrius ferrugineus tawny shark fish 19713 17 5 5
275411 Breviraja colesi Lightnose skate fish 19600 19 1 1
217356 Negaprion acutidens sicklefin lemon shark fish 19714 21 25 25
313100 Stegostoma tigrinum Zebra shark fish 19759 21 25 25
105849 Pristis pristis common sawfish fish 19724 32 50 50
316573 Bathytoshia brevicaudata Short-tail stingray fish 19594 37 1 1
712972 Taeniurops meyeni Round ribbontail ray fish 19761 40 5 5
275232 Syngnathus scovelli Gulf pipefish fish 19821 41 1 1
221382 Chlopsis dentatus mottled false moray fish 23192 52 1 1
271327 Rhizoprionodon longurio Pacific sharpnose shark fish 19737 73 5 5
271947 Ophichthus cylindroideus Dusky snake eel fish 23323 83 1 1
218539 Plectorhinchus gibbosus Harry Hotlips fish 19543 85 1 1
282384 Protosciaena trewavasae New Grenada drum fish 23133 96 1 1
426444 Conus cardinalis cardinal cone invertebrate 23037 105 2 2
218016 Hippichthys spicifer bellybarred pipefish fish 19798 109 1 1
271479 Gymnura marmorata California butterfly ray fish 19670 111 2 2
321650 Springeria folirostris Leaf-nose leg skate fish 19752 125 1 1

8 Timeline Reconstruction

Code
# compare Rice's whale rng_iucn across all DB versions
d_timeline <- map_dfr(names(cons), \(db_name) {
  con <- cons[[db_name]]
  tryCatch(
    {
      d <- dbGetQuery(
        con,
        glue(
          "
      SELECT m.ds_key, COUNT(mc.cell_id) as n_cells,
             MIN(mc.value) as v_min, MAX(mc.value) as v_max
      FROM taxon_model tm
      JOIN model m ON tm.mdl_seq = m.mdl_seq
      JOIN model_cell mc ON m.mdl_seq = mc.mdl_seq
      WHERE tm.taxon_id = {taxon_id_ricei}
        AND m.ds_key IN ('rng_iucn', 'ca_nmfs', 'ch_nmfs', 'ms_merge')
      GROUP BY m.ds_key"
        )
      )
      d$db_version <- db_name
      d
    },
    error = \(e) tibble()
  )
})

if (nrow(d_timeline) > 0) {
  d_timeline |>
    select(db_version, ds_key, n_cells, v_min, v_max) |>
    arrange(db_version, ds_key) |>
    knitr::kable(
      caption = "Rice's whale cell counts across DB versions (timeline)"
    )
} else {
  message("No timeline data found")
}
Rice’s whale cell counts across DB versions (timeline)
db_version ds_key n_cells v_min v_max
Feb 10 ca_nmfs 1969 100 100
Feb 10 ch_nmfs 2677 100 100
Feb 10 ms_merge 3617 100 100
Feb 10 rng_iucn 3938 50 50
Feb 12 (v3 backup) ca_nmfs 1969 100 100
Feb 12 (v3 backup) ch_nmfs 2677 100 100
Feb 12 (v3 backup) ms_merge 3617 100 100
Feb 12 (v3 backup) rng_iucn 1969 50 50
v3 current ca_nmfs 1969 100 100
v3 current ch_nmfs 2677 100 100
v3 current ms_merge 3617 100 100
v3 current rng_iucn 1969 50 50
v3b ca_nmfs 1969 100 100
v3b ch_nmfs 2677 100 100
v3b ms_merge 1676 43 43
v3b rng_iucn 1969 50 50
Code
message(
  "
Timeline of key events:
- Jan 22-23: IUCN range rasterization (6.2 hrs)
- Jan 29: backup sdm_2026-01-29.duckdb
- Feb 10: backup sdm_2026-02-10.duckdb
- Feb 11: fix_taxon_model_dups.qmd — cleaned duplicates
- Feb 12: calc_scores.qmd recalculated — backup sdm_v3_2026-02-12.duckdb

If Jan 29 backup already shows ~1,969 cells for rng_iucn, then the corruption
happened during or immediately after IUCN range rasterization (Jan 22-23).
Check if the rasterized TIF was correct (sf::summarize() with sf_use_s2(FALSE)
may have produced GEOMETRYCOLLECTION instead of valid polygon).
"
)

Timeline of key events:
- Jan 22-23: IUCN range rasterization (6.2 hrs)
- Jan 29: backup sdm_2026-01-29.duckdb
- Feb 10: backup sdm_2026-02-10.duckdb
- Feb 11: fix_taxon_model_dups.qmd — cleaned duplicates
- Feb 12: calc_scores.qmd recalculated — backup sdm_v3_2026-02-12.duckdb

If Jan 29 backup already shows ~1,969 cells for rng_iucn, then the corruption
happened during or immediately after IUCN range rasterization (Jan 22-23).
Check if the rasterized TIF was correct (sf::summarize() with sf_use_s2(FALSE)
may have produced GEOMETRYCOLLECTION instead of valid polygon).

9 Cleanup

Code
walk(cons, \(con) dbDisconnect(con, shutdown = TRUE))