---
title: "Ingest BirdLife.org's Birds of the World (BOTW)"
format: html
editor_options:
chunk_output_type: console
---
## Overview
This script ingests the Birds of the World (BOTW) dataset from BirdLife.org, which is available as a GeoPackage file. The dataset contains information about bird species, their distributions, and other relevant data.
- Fix `terra::rotate()` : [ `rotate()\` not working `[-180, 180]` to `[0, 360]`: adding only one pixel west · Issue #1876 · rspatial/terra ](https://github.com/rspatial/terra/issues/1876)
```{r setup}
librarian:: shelf (
DBI, dplyr, DT, duckdb, fs, glue, here, logger, mapview, readr, sf, stringr,
rspatial/ terra, # 1.8-61 2025-07-22
tibble,
quiet = T)
options (readr.show_col_types = F)
is_server <- Sys.info ()[["sysname" ]] == "Linux"
dir_data <- ifelse (is_server, "/share/data" , "~/My Drive/projects/msens/data" ) |>
normalizePath ()
cell_tif <- glue ("{dir_data}/derived/r_bio-oracle_planarea.tif" )
er_gpkg <- glue ("{dir_data}/derived/ply_ecoregions_2025.gpkg" )
sdm_db <- glue ("{dir_data}/derived/sdm.duckdb" )
dir_bl <- glue ("{dir_data}/raw/birdlife.org" )
dir_rast <- glue ("{dir_bl}/rast" )
botw_gpkg <- glue ("{dir_bl}/BOTW_GPKG_2024_2/BOTW_2024_2.gpkg" )
log_txt <- glue ("{dir_rast}/_log 2025-07-23 01:04:17.350045.txt" )
```
Layers:
- `main_BL_HBW_Checklist_V9` : data frame with species information
- `all_species` : geometries (some species missing)...
```
[1] "Caloenas maculata" "Rhinoplax vigil" "Tanygnathus everetti"
[4] "Cissa thalassina" "Pycnonotus zeylanicus" "Pycnonotus bimaculatus"
[7] "Zosterops flavus" "Garrulax rufifrons" "Garrulax bicolor"
[10] "Pterorhinus courtoisi" "Leucopsar rothschildi" "Gracula robusta"
[13] "Cyornis banyumas" "Chloropsis cochinchinensis"
```
TODO:
- [ ] Evaluate application of fields (`presence` , `origin` , `seasonal` ) to
suitability value, based on descriptions in the documentation
(`Metadata_BirdLife_HBW_Bird_Maps_2024_2.docx` ).
```{r ck_botw_gpkg}
# explore layers in the GeoPackage ----
# (d_lyrs <- st_layers(botw_gpkg))
# layer_name geometry_type features fields crs_name
# 1 all_species Multi Polygon 17379 19 WGS 84
# 2 main_BL_HBW_Checklist_V9 NA 11195 15 <NA>
#
# ply_all <- read_sf(botw_gpkg, d_lyrs$name[1])
# Simple feature collection with 17379 features and 18 fields
# Geometry type: GEOMETRY
# Dimension: XY
# Bounding box: xmin: -180 ymin: -85.58276 xmax: 180 ymax: 89.97895
# Geodetic CRS: WGS 84
# # A tibble: 17,379 × 19
# sisid sci_name presence origin seasonal source compiler data_sens sens_comm dist_comm tax_comm
# <int> <chr> <int> <int> <int> <chr> <chr> <int> <chr> <chr> <chr>
# 1 2.27e7 Accipit… 1 1 2 del H… Philip … 0 " " " " " "
# 2 2.27e7 Accipit… 1 1 4 Xeno … Hannah … 0 " " " " " "
# 3 2.27e7 Accipit… 1 1 3 del H… Philip … 0 " " " " " "
# 4 2.27e7 Acridot… 4 1 1 BirdL… Rob Mar… 0 " " " " " "
# 5 2.27e7 Acridot… 1 1 1 eBird… Rob Mar… 0 " " " " " "
# 6 2.27e7 Acridot… 1 5 1 eBird… Rob Mar… 0 " " " " " "
# 7 2.27e7 Acridot… 1 3 1 Seng … Rob Mar… 0 " " " " " "
# 8 2.27e7 Aegoliu… 1 1 1 Ding,… Rob Mar… 0 " " " " " "
# 9 2.27e7 Aegoliu… 1 1 3 Cramp… Rob Mar… 0 " " " " " "
# 10 2.27e7 Aepypod… 1 1 5 Mauro… Rob Mar… 0 " " "" ""
# # ℹ 17,369 more rows
# # ℹ 8 more variables: generalisd <int>, citation <chr>, yrcompiled <int>, yrmodified <dbl>,
# # version <chr>, Shape_Length <dbl>, Shape_Area <dbl>, Shape <MULTIPOLYGON [°]>
# # ℹ Use `print(n = ...)` to see more rows
# ply_hbw <- read_sf(botw_gpkg, d_lyrs$name[2])
#
# sci_all <- unique(ply_all$sci_name)
# sci_hbw <- unique(ply_hbw$ScientificName)
#
# (sci_all_not_hbw <- setdiff(sci_all, sci_hbw))
# character(0)
# (sci_hbw_not_all <- setdiff(sci_hbw, sci_all))
# [1] "Caloenas maculata" "Rhinoplax vigil" "Tanygnathus everetti"
# [4] "Cissa thalassina" "Pycnonotus zeylanicus" "Pycnonotus bimaculatus"
# [7] "Zosterops flavus" "Garrulax rufifrons" "Garrulax bicolor"
# [10] "Pterorhinus courtoisi" "Leucopsar rothschildi" "Gracula robusta"
# [13] "Cyornis banyumas" "Chloropsis cochinchinensis"
#
# d_hbw_dupes <- ply_hbw |>
# st_drop_geometry() |>
# group_by(ScientificName) |>
# summarize(
# n = n(),
# common_name = first(CommonName),
# .groups = "drop")
# table(d_hbw_dupes$n)
# 1
# 11195
#
# CONCLUSION:
# - `main_BL_HBW_Checklist_V9` is data frame,
# - `all_species` the geometry (some species missing)
```
```{r setup_iter_spp_rast}
#| eval: false
d_spp <- read_sf (botw_gpkg, "main_BL_HBW_Checklist_V9" )
p_spp <- read_sf (botw_gpkg, "all_species" )
# table(d_spp$IUCN_RedList_Category_2024)
# CR CR (PE) CR (PEW) DD EN EW EX LC NT VU
# 204 18 1 38 395 5 164 8742 935 693
er1 <- read_sf (er_gpkg) |> st_union () # [-180, 180]
# TODO : check Pink-footed Shearwater _Ardenna creatopus_ VU
# https://datazone.birdlife.org/species/factsheet/pink-footed-shearwater-ardenna-creatopus
# table(st_geometry_type(ply_spp))
# MULTIPOLYGON MULTISURFACE
# 17,212 167
r_cell <- rast (cell_tif, lyrs = "cell_id" ) # [ 0, 360]
r_cell_r <- rotate (r_cell) # [-180, 180]
ext (r_cell_r) <- round (ext (r_cell_r), 3 )
spp <- unique (p_spp$ sci_name) |> sort ()
```
```{r iter_spp_rast}
#| eval: false
log_txt <- glue ("{dir_rast}/_log {Sys.time()}.txt" )
log_appender (appender_file (log_txt))
for (i in 1 : length (spp)){ # i = 102 # Acrocephalus_familiaris
sp <- spp[i] # sp <- "Acanthis flammea"
log_info ("| INIT | {sp} | {i}/{length(spp)}" )
r_tif <- glue ("{dir_rast}/{str_replace_all(sp, ' ', '_')}.tif" )
if (file.exists (r_tif)){
log_info ("| SKIP | {sp} | tif already exists" )
next ()
}
p <- p_spp |>
filter (
sci_name == !! sp,
presence %in% c (1 ,2 ,3 ),
# presence -- 1: Extant, 2: Probably Extant, 3: Possibly Extant
st_geometry_type (geom) != "MULTISURFACE" ) |>
mutate (
value = 50 L) # default value of 50%
if (nrow (p) == 0 ){
log_info ("| SKIP | {sp} | no valid geometry for species" )
next ()
}
# mapView(p)
p <- try (
p |>
st_make_valid () |>
st_filter (er1, .predicate = st_intersects),
silent = T)
# mapView(p)
if (inherits (p, "try-error" )){
log_info ("| SKIP | {sp} | error in st_make_valid(), st_filter(): {p}" )
next ()
}
if (nrow (p) == 0 ){
log_info ("| SKIP | {sp} | not intersecting ecoregions" )
next ()
}
r <- try (
rasterize (
p,
r_cell_r,
field = "value" ) |>
# trim() |>
# mapView()
rotate () |>
crop (r_cell) |>
mask (r_cell),
silent = T)
# plot(trim(r))
if (inherits (r, "try-error" )){
log_info ("| SKIP | {sp} | error in rasterize(),...: {r}" )
next ()
}
n_cells <- length (values (r, na.rm = T))
if (n_cells == 0 ){
log_info ("| SKIP | {sp} | rast has no values" )
next ()
}
names (r) <- sp
round (ext (r), 3 )
# plet(rotate(r))
# as.polygons(rotate(r)) |> mapView()
log_info ("| WRITE | {sp} | {format(n_cells, big.mark=',')} cells" )
writeRaster (
r,
filename = r_tif,
overwrite = T)
}
# INFO [2025-07-23 00:26:20] 1/11181: Abeillia abeillei
# INFO [2025-07-23 00:31:19] 261/11181: Agelaius xanthomus
# (260/5) * 11181 / 60 = 4 hrs to completion
# d <- as.data.frame(r, cells = T, na.rm = T) |>
# tibble() |>
# # cell | `Acanthis flammea`
# select(cell_id = cell, value = 2)
# dbWriteTable(con_sdm, "species", d, append = TRUE)
# cell_id value
# <int> <int>
# 1 276146 1
# 2 276147 1
# 3 276148 1
# ply_spp_er |>
# st_drop_geometry() |>
# group_by(sci_name) |>
# summarize(
# n_ply = n(),
# .groups = "drop") |>
# left_join(
# d_spp,
# by = c("sci_name" = "ScientificName")) |>
# pull(IUCN_RedList_Category_2024) |>
# table()
# TODO : check out "Struthio camelus", "Aegolius funereus"
# TODO : fill out other lookup fields
# mutate(
# origin = case_when(
# origin == 1 ~ "Native",
# origin == 2 ~ "Reintroduced",
# origin == 3 ~ "Introduced",
# origin == 4 ~ "Vagrant",
# origin == 5 ~ "Origin Uncertain",
# origin == 6 ~ "Assisted Colonisation",
# TRUE ~ NA_character_))
```
```{r tbl_log}
d_log <- readLines (log_txt) |>
str_subset ("^INFO" ) |> # remove extra lines from error output
paste (collapse = " \n " ) |>
read_delim (
delim = " | " ,
col_names = c ("info" , "type" , "sp" , "msg" )) |>
mutate (across (everything (), str_trim)) |>
mutate (
time = str_extract (info, " \\ d{4}- \\ d{2}- \\ d{2} \\ d{2}: \\ d{2}: \\ d{2}" ) |>
as.POSIXct (),
level = str_extract (info, "INFO|DEBUG|ERROR|WARN" )) |>
filter (type != "INIT" ) |>
mutate (
msg = str_replace (msg, "error.+" , "error reading geometry" )) |>
mutate (
tif_created = case_when (
msg == "tif already exists" ~ T, # 39
type == "WRITE" ~ T, # 496
.default = F), # 535: sum
has_problem = case_when (
type == "SKIP" & ! (msg %in% c (
"not intersecting ecoregions" ,
"tif already exists" )) ~ T, # T: 443
.default = F)) # F: 10,620
# DEBUG ----
# summarize
list (
` TIF created ` = sum (d_log$ tif_created),
` outside USA ` = d_log |> filter (! tif_created, ! has_problem) |> nrow (),
` has problem ` = sum (d_log$ has_problem)) |>
enframe (name = "status" , value = "count" ) |>
mutate (
count = unlist (count)) |>
datatable (
caption = "Summary of TIF creation and issues" ,
options = list (dom = 't' )) |>
formatRound (columns = "count" , digits = 0 , mark = "," )
d_log |>
filter (tif_created) |>
select (sp) |>
arrange (sp) |>
select (species = sp) |>
datatable (
caption = "Species with created TIF files intersecting USA waters." )
d_log |>
filter (! tif_created, ! has_problem) |>
select (sp) |>
arrange (sp) |>
select (species = sp) |>
datatable (
caption = "Species without TIF files not intersecting USA waters." )
d_log |>
filter (has_problem) |>
arrange (sp) |>
select (species = sp, problem = msg) |>
datatable (
caption = "Species with problems encountered during processing." )
d_log |>
filter (has_problem) |>
select (problem = msg) |>
group_by (problem) |>
summarize (
count = n (),
.groups = "drop" ) |>
arrange (desc (count)) |>
datatable (
caption = "Problems encountered during processing" ,
options = list (dom = 't' ))
```
- `error reading geometry` : even after applying `sf::st_make_valid()` , loops are detected when checking for intersection with ecoregions.
- `no valid geometry for species` : no valid geometry after applying filter for
presence (`1` : Extant, `2` : Probably Extant, `3` : Possibly Extant), and removing
geometry types of `MULTISURFACE` .
- `rast has no values` : the raster has no values after rasterizing the geometries,
despite the geometry being valid and intersecting with ecoregions.
```{r stop_eval}
# knitr::opts_chunk$set(eval = F)
```
## Insert into Database
```{r init_db}
# con_spp <- dbConnect(duckdb(dbdir = spp_db, read_only = T))
con_sdm <- dbConnect (duckdb (dbdir = sdm_db, read_only = F))
# dbListTables(con_sdm)
# tbl(con_sdm, "species") |>
# filter(common_name_dataset == "blue whale") |>
# collect() |>
# View()
```
### Add `dataset`
```{r insert_dataset}
ds_key <- "bl"
row_dataset <- tibble (
ds_key = !! ds_key,
name_short = "BirdLife Birds of the World, 2024" ,
name_original = "BirdLife International and Handbook of the Birds of the World (2024)" ,
description = "This dataset contains digital distribution information for the world’s birds. It is a joint product of BirdLife International and Handbook of the Birds of the World." ,
citation = "" ,
source_broad = "BirdLife" ,
source_detail = "http://datazone.birdlife.org" ,
# TODO : citation
# BirdLife International and Handbook of the Birds of the World (2024) Bird species distribution maps of the world. Version 2024.2. Available at http://datazone.birdlife.org/species/requestdis.
regions = "Global" ,
response_type = "binary" ,
taxa_groups = "birds" ,
year_pub = 2024 ,
date_obs_beg = NA ,
date_obs_end = NA ,
date_env_beg = NA ,
date_env_end = NA ,
link_info = "http://datazone.birdlife.org/species/requestdis" ,
link_download = "http://datazone.birdlife.org/species/requestdis" ,
link_metadata = "https://www.fisheries.noaa.gov/inport/item/65207" ,
links_other = "https://noaa.maps.arcgis.com/apps/webappviewer/index.html?id=68d8df16b39c48fe9f60640692d0e318" ,
spatial_res_deg = 0.05 ,
temporal_res = "static" )
if (dbExistsTable (con_sdm, "dataset" ))
dbExecute (con_sdm, glue ("DELETE FROM dataset WHERE ds_key = '{ds_key}'" ))
dbWriteTable (con_sdm, "dataset" , row_dataset, append = TRUE )
```
### Add `species`
```{r insert_species}
r_spp <- rast (dir_ls (dir_rast, glob = "*.tif" ))
names (r_spp)
```
## Close Database
```{r close_db}
dbDisconnect (con_sdm, shutdown = T); rm (con_sdm)
```
::: {.callout-caution collapse="true"}
## Session Info
```{r session_info}
devtools:: session_info ()
```
:::