# Derive a `pa` (protected-area name) column from the raw legal text in
# `textes`. Candidate names are pulled out with a cascade of regexes, then
# stripped of quotes, lead-in words and stray punctuation. A newly created
# `pa` column is placed immediately before `textes`.
class_results <- mutate(
  class_results,
  # Primary attempt: the first span opened by `"` or `«` and closed by `"`,
  # `»` or a comma, or the text introduced by "nommée" up to `»` or a comma.
  # When that yields nothing, retry with "nommée ..." closed by a straight
  # double quote.
  pa = coalesce(
    str_extract(
      textes,
      regex("[\"«](.*?)[\"»,]|nommée\\s*(.*?)[»,]", ignore_case = TRUE)
    ),
    str_extract(
      textes,
      regex("nommée\\s*(.*?)[\"]", ignore_case = TRUE)
    )
  ),
  # Texts containing "respectivement" always take whatever follows that word,
  # up to the next full stop, "N°" or "ETAT" (this overrides the value above,
  # including with NA when the extraction fails).
  pa = if_else(
    str_detect(textes, "respectivement"),
    str_extract(
      textes,
      regex("respectivement\\s+(.*?)(?=\\.|N°|ETAT)", ignore_case = TRUE)
    ),
    pa
  ),
  # Still-missing names: take the text after "station forestière d'" up to
  # the next comma.
  pa = if_else(
    is.na(pa) & str_detect(textes, "station forestière d'"),
    str_extract(
      textes,
      regex("(?<=station forestière d')[^,]+", ignore_case = TRUE)
    ),
    pa
  ),
  # Clean-up, applied in this exact order.
  pa = pa |>
    # Drop the straight quote in front of "Complexe des Aires".
    str_replace("\"Complexe des Aires", "Complexe des Aires") |>
    # Strip the lead-in word "nommée".
    str_remove("^nommée\\s*") |>
    # Strip the lead-in word "respectivement".
    str_remove("^respectivement\\s*") |>
    # Remove a trailing run of characters (free of `"` and `«`) that
    # directly follows a straight double quote.
    str_remove("(?<=\")[^\"«]*$") |>
    # Remove every remaining straight quote and guillemet.
    str_remove_all("[\"«»]") |>
    # Trim surrounding whitespace.
    str_trim() |>
    # Remove one trailing comma.
    str_remove(",$") |>
    # Remove one leading apostrophe.
    str_remove("^'"),
  .before = textes
)
# Normalise whitespace in the protected-area name columns of a data frame.
#
# Arguments:
#   df        Data frame holding the name columns.
#   name_cols Character vector naming the columns to clean (e.g. "NOM",
#             "SHORT_NAME"); it must be supplied, and every listed column
#             must exist in `df` (`all_of()` errors otherwise).
#
# Returns `df` with the selected columns cleaned; other columns are untouched.
clean_pa_names_cols <- function(df, name_cols) {
  tidy_name <- function(raw) {
    # Turn carriage returns, line feeds and tabs into plain spaces.
    flattened <- str_replace_all(raw, "[\r\n\t]", " ")
    # Collapse repeated whitespace, then trim both ends.
    str_trim(str_squish(flattened))
  }

  mutate(df, across(all_of(name_cols), tidy_name))
}
# Normalise whitespace (line breaks, tabs, repeated or edge spaces) in the
# extracted `pa` names.
class_results <- class_results |>
  clean_pa_names_cols(name_cols = "pa")
# Known protected-area names, including the spelling and capitalisation
# variants listed here. Stored de-duplicated and sorted; the names are
# listed once each, in order of first appearance.
pn_rs <- sort(unique(c(
  "Andohahela", "Nosy Mangabe", "Montagne d'Ambre", "Ankarafantsika",
  "Analamazaotra", "Kirindy Mite", "Tsimanampesotse", "Mikea",
  "Nosy Hara", "Nosy Tanikely", "Lokobe", "Tsimanampetsotsa",
  "Namokora", "Mantadia", "Marojejy", "Befotaka Midongy",
  "Zombitse-Vohibasia", "Baie de Baly", "Tsingy-de-Bemaraha", "Masoala",
  "Isalo", "montagne d'Ambre", "Andringitra", "Manongarivo",
  "Ambatovaky", "Beza-Mahafaly", "Cap Sainte Marie", "Anjanaharibe-Sud",
  "forêt d'Ambohitantely", "Manombo", "île de Mangabe", "Ambatovavy",
  "pic d'Ivohibe", "Mangerivola", "cap Sainte-Marie", "forêt d'Ambre",
  "forêt Tampoketsa d'Analamaitso", "Andranomena", "Ambohijanahary",
  "Pointe à Larrée"
)))
# Normalise the legal text, then fill in protected-area names that the
# regex-based extraction left missing.
class_results <- class_results %>%
  mutate(
    # Flatten line breaks and slashes to single spaces so multi-word names
    # can be matched in one piece.
    textes = str_replace_all(textes, "[\r\n]", " "),
    textes = str_replace_all(textes, "/", " "),
    textes = str_squish(textes),
    # Fallback for rows without a name: the first entry of `pn_rs` (in its
    # stored order) found in the text, or NA when none matches. Names already
    # extracted are kept as they are.
    pa = coalesce(
      pa,
      map_chr(
        textes,
        \(txt) first(pn_rs[str_detect(txt, pn_rs)], default = NA_character_)
      )
    ),
    # Manual overrides keyed on the text number. `%in%` is used instead of
    # `==` because `==` yields NA for a missing `num_texte`, which made
    # `ifelse()` overwrite an already-extracted name with NA.
    #
    # "Pointe à Larrée" is not picked up by the lookup above for a reason that
    # is not yet understood.
    # NOTE(review): possibly a Unicode-normalisation or non-breaking-space
    # mismatch on the accented name - TODO confirm.
    pa = replace(pa, num_texte %in% "2015-773", "Pointe à Larrée"),
    # Spelled "Andringitra", matching the entry in `pn_rs` (previously
    # "Andrigitra").
    pa = replace(pa, num_texte %in% "98-376", "Andringitra")
  )
# Texts 18633/2008, 52005/2010 and 9874/2013 are each assigned one
# comma-separated list of names, taken from the `NOM` column of the SAPM 2010
# table (rows where `YEAR_IMPLE == "X"`).
class_results <- class_results %>%
  mutate(
    # `%in%` never returns NA, so rows with a missing `num_texte` keep their
    # current `pa`; the previous `==` / `|` test inside `ifelse()` turned
    # them into NA.
    pa = replace(
      pa,
      num_texte %in% c("18633/2008", "52005/2010", "9874/2013"),
      read_rds("data/no_id/sapm_2010.rds") |>
        filter(YEAR_IMPLE == "X") |>
        pluck("NOM") |>
        paste(collapse = ", ")
    )
  )