5  Consolidation des noms

Nous chargeons les différents jeux de données spatiaux provenant de sources hétérogènes (ANGAP, SAPM, WDPA, Vahatra) ainsi que les données légales extraites manuellement. Ils seront harmonisés et fusionnés dans un jeu unique de référence.

Code
# Load required libraries
library(tidyverse)
library(sf)
library(arrow)
library(geoarrow)
library(stringdist)
library(wdpar)


# Read all spatial datasets to be consolidated
angap_2002 <- read_rds("data/no_id/angap_2002.rds")
sapm_2010  <- read_rds("data/no_id/sapm_2010.rds")
sapm_2017  <- read_rds("data/no_id/sapm_2017.rds")
sapm_2024  <- read_rds("data/no_id/sapm_2024.rds") 
sapm_evol_2001_2011 <- read_rds("data/no_id/sapm_evol_2001_2011.rds")
vahatra <- read_rds("data/no_id/vahatra.rds")
wdpa_mdg_2025 <- wdpa_read("sources/WDPA_WDOECM_mar2025_Public_MDG.zip") %>%
  wdpa_clean(erase_overlaps = FALSE)

# Read WDPA historical parquet file
wdpa_mdg_all <- read_parquet("data/MDG_WDPA_Consolidated.parquet")  %>%
  mutate(geometry = st_as_sfc(geometry)) %>%
  st_as_sf() |> 
  mutate(dataset_id = paste0("WDPA_", data_year)) |> 
  select(data_year)

# Load legal data
legal <- read_rds("data/id/legal_texts.rds")

5.1 Traitement des métadonnées ANGAP (2002)

Le champ CLASSEMENT contient des références légales souvent partielles ou hétérogènes. Une extraction manuelle a permis d’en dériver les champs date_texte, type_texte, num_texte (catégories existantes dans les données CNLEGIS), et une année de statut STATUS_YR. Ces champs sont intégrés au jeu de données harmonisé.

Code
angap_2002 <- angap_2002 |> 
  rename(NAME = NOM, DESIG = TYPE_AP, IUCN_CAT = IUCN, LEGAL_TXT = CLASSEMENT)

legal_txt_content <- tribble(
  ~num_ligne, ~date_texte,        ~type_texte, ~num_texte,
  1,          "1958-10-28",       "Décret",    "58.10",
  2,          "1997-03-02",       "Décret",    "97-141",
  3,          "1958-10-28",       "Décret",    "58.08",
  4,          "1982-02-12",       "Décret",    "82.078",
  5,          "1970-07-21",       "Décret",    "2778-MAER/SEGREF/FOR",
  6,          "1956-02-20",       "Décret",    "56.208",
  7,          "1997-08-07",       "Décret",    "97.1043",
  8,          "1958-10-28",       "Décret",    "58.13",
  9,          "1998-10-19",       "Décret",    "98.376",
  10,         "1958-10-28",       "Décret",    "58.12",
  11,         NA,                 NA,          NA,
  12,         "1956-02-20",       "Décret",    "56.208",
  13,         "1997-12-18",       "Décret",    "97-1452",
  14,         NA,                 NA,          NA,
  15,         "1956-09-10",       "Décret",    NA,
  16,         "1966-06-01",       "Décret",    "66.242",
  17,         "1986-06-04",       "Décret",    "86.168",
  18,         "1966-04-22",       "Décret",    "64-159",
  19,         "1962-10-24",       "Décret",    "62.527",
  20,         "1958-10-28",       "Décret",    "58.15",
  21,         "1997-03-02",       "Décret",    "97-141",
  22,         "1962-07-19",       "Décret",    "62.371",
  23,         "1959-04-24",       "Décret",    "58.59",
  24,         "1956-09-10",       "Décret",    NA,
  25,         "1997-12-18",       "Décret",    "97.1453",
  26,         "1966-06-01",       "Décret",    "66.242",
  27,         "1989-07-25",       "Décret",    "89-216",
  28,         "1989-07-25",       "Décret",    "89-216",
  29,         "1958-10-28",       "Décret",    "58.10",
  30,         "1956-02-20",       "Décret",    "56.208",
  31,         "1962-12-05",       "Décret",    "62.637",
  32,         "1956-02-20",       "Décret",    "56.208",
  33,         NA,                 NA,          NA,
  34,         "1998-05-19",       "Décret",    "98.375",
  35,         "1956-02-20",       "Décret",    "56.208",
  36,         "1997-03-02",       "Décret",    "97-141",
  37,         NA,                 NA,          NA,
  38,         "1997-12-18",       "Décret",    "97.1451",
  39,         "1958-10-28",       "Décret",    "58.07",
  40,         "1965-12-14",       "Décret",    "95.795", 
  41,         "1954-09-16",       "Décret",    "64.380",
  42,         "1989-07-25",       "Décret",    "89-216",
  43,         NA,                 NA,          NA,
  44,         "1958-10-28",       "Décret",    "58.14",
  45,         "1997-03-02",       "Décret",    "97-141",
  46,         "1966-06-01",       "Décret",    "66.242",
  47,         "1966-06-01",       "Décret",    "66.242",
  48,         "1997-08-07",       "Décret",    "97-1045",
  49,         "1966-06-01",       "Décret",    "66.242",
  50,         "1966-06-01",       "Décret",    "66.242",
  51,         "1997-08-07",       "Décret",    "97-1045",
  52,         NA,                 NA,          NA,
  53,         "1997-12-18",       "Décret",    "97-1454",
  54,         "1997-08-07",       "Décret",    "97.1044",
  55,         "1966-06-01",       "Décret",    "66.242",
  56,         "1997-12-18",       "Décret",    "97-1454"
) %>% 
  mutate(
    STATUS_YR = case_when(
      num_ligne %in% c(16, 46, 47, 49, 50, 55) ~ 1927,
      TRUE ~ lubridate::year(lubridate::ymd(date_texte))
    )
  )
angap_2002 <- angap_2002 %>%
  bind_cols(
    legal_txt_content %>%
      select(-num_ligne)
  ) |> 
  select(-LEGAL_TXT) |> 
  mutate(dataset_id = "ANGAP_2002")

5.2 Harmonisation des autres sources (SAPM, Vahatra)

Les autres jeux de données sont également harmonisés avec un sous-ensemble de variables cibles. On applique des transformations simples et un renommage standardisé pour assurer l’interopérabilité.

Code
sapm_2010 <- sapm_2010 |>
  select(NAME = NOM, DESIG = DESCRIPTIO, MANG_AUTH = DATAADMIN) |> 
  mutate(DESIG = str_replace(DESIG, "Protge", "Protégée"),
         dataset_id = "SAPM_2010")

sapm_evol_2001_2011 <- sapm_evol_2001_2011 |> 
  mutate(YEAR = as.numeric(YEAR)) |> 
  select(NAME = NOM, DESIG = DESCRIPTIO, MANG_AUTH = DATAADMIN,
         STATUS_YR = YEAR) |> 
  mutate(dataset_id = "SAPM_evol_2001-2011")

sapm_2017 <- sapm_2017 |>
  mutate(DATE_CREAT = year(ymd(DATE_CREAT)),
         MANG_AUTH = paste(GEST_1, GEST_2),
         DAT_ST_DEF = as.character(DAT_ST_DEF)) |> 
  select(NAME = SHORT_NAME, ORIG_NAME = FULL_NAME, 
         IUCN_CAT = CATEG_IUCN, STATUS_YR = DATE_CREAT,
         date_texte = DAT_ST_DEF, MARINE = TYPE_AP, 
         GOV_TYPE = GOUVERNANC, num_text = STAT_DEF, 
         MANG_AUTH) |> 
  mutate(dataset_id = "SAPM_2017")

sapm_2024 <- sapm_2024 |>
  mutate(DATE_CREAT = year(ymd(DATE_CREAT)),
         MANG_AUTH = paste(GEST_1, GEST_2),
         DAT_ST_DEF = as.character(DAT_ST_DEF)) |> 
  select(NAME = SHORT_NAME,  
         IUCN_CAT = CATEG_IUCN, STATUS_YR = DATE_CREAT,
         date_texte = DAT_ST_DEF, MARINE = TYPE_AP, 
         GOV_TYPE = GOUVERNANC, num_text = STAT_DEF, 
         MANG_AUTH) |> 
  mutate(dataset_id = "SAPM_2024")

vahatra <- vahatra |> 
  mutate(STATUS_YR = year(ymd(date_creation)),
         MANG_AUTH = paste(gest_1, gest_2),
         date_texte = ifelse(is.na(date_modification), date_creation,
                             paste(date_creation, date_modification, sep = ";"))) |> 
  select(NAME = nom, IUCN_CAT = cat_iucn, ORIG_NAME = full_name,
         num_texte = creation, MANG_AUTH, STATUS_YR, date_texte) |> 
  mutate(dataset_id = "Vahatra")

5.3 Fusion dans un jeu consolidé

Tous les jeux de données sont fusionnés dans un unique tableau avec un schéma unifié.

Code
all_PAs <- bind_rows(
  angap_2002,
  sapm_2010,
  sapm_2017,
  sapm_2024,
  sapm_evol_2001_2011,
  vahatra
)

Appariement spatial avec la WDPA

On utilise une fonction pa_match() pour identifier les aires protégées correspondantes dans la WDPA, sur la base de la proximité spatiale et du taux de recouvrement géométrique.

Code
# Match protected areas from x to y based on centroid distance and spatial overlap.
# Returns the original x with best matches from y, using overlap thresholds and ranking.
# Adds matched name, ID, and overlap percentage to x.
pa_match <- function(x, y,
                     pa_name_x = "NAME",
                     pa_name_y = "NAME",
                     pa_id_y = "WDPAID",
                     threshold_strong_match = 0.35,
                     threshold_weak_match = 0.1) {
  
  # Harmonize CRS and fix geometry
  y <- st_transform(y, st_crs(x)) %>%
    st_make_valid()
  
  # Prepare x and y with centroids
  x_aug <- x %>%
    mutate(index_x = row_number(),
           name_x = .data[[pa_name_x]],
           centroid_x = st_centroid(geometry))
  
  y_aug <- y %>%
    mutate(index_y = row_number(),
           name_y = .data[[pa_name_y]],
           id_y = .data[[pa_id_y]],
           centroid_y = st_centroid(geometry))
  
  # Find 3 closest y for each x
  x_matches <- x_aug %>%
    rowwise() %>%
    mutate(
      closest_y = list(
        y_aug %>%
          st_drop_geometry() %>%
          mutate(dist = as.numeric(st_distance(centroid_x, centroid_y))) %>%
          arrange(dist) %>%
          slice_head(n = 2) %>%
          select(index_y, dist)
      )
    ) %>%
    unnest(closest_y)
  
  # Compute spatial overlaps
  matches_with_overlap <- x_matches %>%
    rowwise() %>%
    mutate(
      overlap = tryCatch({
        inter <- st_intersection(geometry, y_aug$geometry[index_y])
        if (length(inter) > 0) {
          round(as.numeric(st_area(inter) / st_area(geometry)), 3)
        } else {
          0
        }
      }, error = function(e) 0)
    ) %>%
    ungroup() %>%
    mutate(
      name_y = y_aug[[pa_name_y]][index_y],
      id_y = y_aug[[pa_id_y]][index_y]
    ) %>%
    st_drop_geometry()
  
  # Rank and apply selection logic
  best_matches <- matches_with_overlap %>%
    group_by(index_x) %>%
    arrange(desc(overlap), dist) %>%
    mutate(rank = row_number()) %>%
    slice(1:2) %>%
    summarise(
      index_x = first(index_x),
      name_y = case_when(
        n() >= 2 && overlap[2] > threshold_strong_match && overlap[2] > overlap[1] ~ name_y[2],
        overlap[1] < threshold_weak_match ~ NA_character_,
        TRUE ~ name_y[1]
      ),
      id_y = case_when(
        n() >= 2 && overlap[2] > threshold_strong_match && overlap[2] > overlap[1] ~ id_y[2],
        overlap[1] < threshold_weak_match ~ NA_integer_,
        TRUE ~ id_y[1]
      ),
      overlap_y = case_when(
        n() >= 2 && overlap[2] > threshold_strong_match && overlap[2] > overlap[1] ~ overlap[2],
        overlap[1] < threshold_weak_match ~ NA_real_,
        TRUE ~ overlap[1]
      ),
      .groups = "drop"
    )
  
  # Join results back to x
  output <- x_aug %>%
    left_join(best_matches, by = "index_x") %>%
    select(-index_x, -name_x, -centroid_x)
  
  return(output)
}


all_PAs_matched <- pa_match(all_PAs, wdpa_mdg_2025)

all_PAs_unmatched <- all_PAs_matched %>%
  filter(is.na(name_y))

words_to_exlcude <- c("tsingy de",
                      "corridor entre parcelles i et ii d'",
                      "corridor", 
                      "extension",
                      "for[eê]t d['’]?", "aire protégée( d['’]?)?",
                      "androka",
                      "Maromena"
)

# Enhanced fallback matching function for protected areas
pa_match_fallback_name <- function(x, y,
                                   pa_name_x = "NAME",
                                   pa_name_y = "NAME",
                                   pa_id_y = "WDPAID",
                                   remove_terms,
                                   min_stringdist = 0.08) {
  
  # Clean and normalize names
  clean_name <- function(s) {
    pattern <- str_c(remove_terms, collapse = "|")
    s %>%
      str_to_lower() %>%
      str_replace_all(pattern, "") %>%
      str_squish()
  }
  
  # Create index for reinsertion
  x <- x %>% mutate(row_index = row_number())
  
  # Extract unmatched rows
  x_unmatched <- x %>%
    filter(is.na(name_y)) %>%
    mutate(name_x_raw = .data[[pa_name_x]],
           name_x_clean = clean_name(name_x_raw)) %>%
    distinct(name_x_clean, .keep_all = TRUE) %>%
    mutate(index_x = row_number())
  
  # Prepare y
  y_tbl <- y %>%
    st_drop_geometry() %>%
    mutate(name_y_raw = .data[[pa_name_y]],
           name_y_clean = clean_name(name_y_raw),
           id_y = .data[[pa_id_y]],
           index_y = row_number())
  
  # String distance matrix
  distance_matrix <- stringdistmatrix(x_unmatched$name_x_clean,
                                      y_tbl$name_y_clean,
                                      method = "jw", p = 0.1)
  
  # Get best match per unmatched row
  closest_matches <- tibble(
    index_x = rep(x_unmatched$index_x, times = nrow(y_tbl)),
    index_y = rep(y_tbl$index_y, each = nrow(x_unmatched)),
    dist = as.vector(distance_matrix)
  ) %>%
    group_by(index_x) %>%
    slice_min(dist, n = 1, with_ties = FALSE) %>%
    ungroup()
  
  # Join metadata
  matched <- closest_matches %>%
    left_join(x_unmatched, by = "index_x") %>%
    left_join(y_tbl, by = "index_y") %>%
    filter(dist < min_stringdist)
  
  # Compute overlap with matched geometry
  matched <- matched %>%
    rowwise() %>%
    mutate(overlap_y = tryCatch({
      inter <- st_intersection(geometry, y$geometry[index_y])
      if (length(inter) > 0) round(as.numeric(st_area(inter) / st_area(geometry)), 3) else 0
    }, error = function(e) NA_real_)) %>%
    ungroup()
  
  # Replace original rows
  x_updated <- x %>%
    left_join(
      matched %>%
        select(row_index, name_y = name_y_raw, id_y   = id_y.y, overlap_y, 
               stringdist = dist),
      by = "row_index"
    ) %>%
    mutate(
      name_y = coalesce(name_y.y, name_y.x),
      id_y = coalesce(id_y.y, id_y.x),
      overlap_y = coalesce(overlap_y.y, overlap_y.x),
      stringdist = if ("stringdist.x" %in% names(.)) coalesce(stringdist, stringdist.x) else stringdist
    ) %>%
    select(-row_index, -ends_with(".x"), -ends_with(".y"), -stringdist)
  
  return(x_updated)
} 

# Perform spatial matching
all_PAs_matched <- pa_match_fallback_name(x  = all_PAs_matched, 
                                          y = wdpa_mdg_2025,
                                          remove_terms = words_to_exlcude) %>%
  rename(WDPA_NAME = name_y, WDPAID = id_y, overlap_WPDA = overlap_y)

5.4 Ajout des données légales issues des textes officiels

On ajoute les aires protégées mentionnées dans les décrets à travers le jeu legal contenant les données CNLEGIS. Ces données ne comportent pas de géométrie, aussi on en ajoute une vide afin de pouvoir les consolider dans la même table.

Code
# Format to be compatible with the spatial data
legal_conso <- legal %>%
  mutate(
    NAME = ap_nom_texte,
    dataset_id = "CNLEGIS_2024",
    date_texte = as.character(date_texte),
    geometry = st_sfc(rep(st_geometrycollection(), n()), crs = 4326)) |> 
  select(NAME, dataset_id, date_texte, type_texte, num_texte, type_decision, WDPA_NAME, WDPAID, geometry) %>%
  st_as_sf()

all_PAs_matched <- all_PAs_matched %>%
  st_transform(st_crs(legal_conso)) %>%
  bind_rows(legal_conso)

5.5 Export final

On ajoute enfin les données WDPA de mars 2025. Le jeu consolidé est exporté dans des formats interopérables (.parquet et .rds).

Code
all_PAs_conso <- wdpa_mdg_2025 |> 
  mutate(dataset_id = "WDPA_2025") |> 
  st_transform(st_crs(all_PAs_matched))  |> 
  bind_rows(all_PAs_matched)
all_PAs_conso |> 
  write_parquet("data/id/all_PAs_conso.parquet")
all_PAs_conso |> 
  write_rds("data/id/all_PAs_conso.rds")