Madagascar rural observatory system data validation, preparation, and georeferencing

Author

Florent Bédécarrats

Published

December 3, 2024

Introduction

This technical appendix is a companion to a data descriptor article, submitted to Nature Scientific Data, that focuses on data collected by the Rural Observatory System (ROS) between 1995 and 2015. It provides the source code for all figures and visualizations presented in the paper. It also offers a tutorial on how to georeference this data, which can serve as guidance for various types of analysis beyond this specific application. We use computational notebooks in Quarto format with the R programming language, combining code, results, explanations, and multimedia in an interactive way. The source code can be accessed by expanding code blocks like the following one, which produced the figure below.

Code
# Load required libraries
library(tidyverse)    # A series of packages for data manipulation
library(haven)        # Required for reading STATA files (.dta)
library(labelled)     # To work with labelled data from STATA
library(sf)           # for spatial data handling
library(tmap)         # for mapping
library(readxl)       # Read data frames to Excel format
library(cowplot)      # to combine plots

# Select appropriate folder as data source
data_path <- "data/dta_format/"

# Define a function to load and count surveys per observatory for a given year
load_and_count <- function(year, factorize = FALSE) {
  # Define file path
  file_path <- paste0(data_path, year, "/res_deb.dta")
  
  # Load data
  data <- read_dta(file_path)
  
  # Extract label and convert to factors if option
  if (factorize) {
    data <- data %>%
      mutate(across(everything(), as.character),
             across(where(is.labelled), ~ as.character(as_factor(.))))
  }
  
  # Count surveys per observatory
  count_data <- data %>%
    group_by(j0) %>%
    summarise(survey_count = n()) %>%
    ungroup() %>%
    mutate(year = year)  # Add year column
  
  return(count_data)
}

# Generate a list of years
years <- 1995:2015

# Use purrr::map_df to loop through each year and bind results
obs_count <- map_df(years, load_and_count) %>%
  # Remove rows with observatory "7 " and "NA", which are errors
  filter(j0 != 7 & !is.na(j0) & survey_count > 1) %>%
  rename(observatory = j0)

# Read observatory names
observatory_names <- readxl::read_xlsx("references/observatory_names.xlsx") %>%
  select(code, observatory_name = name)

# PAss it to wide.
obs_count <- obs_count %>%
  left_join(observatory_names, by = c("observatory" = "code")) %>%
  group_by(observatory_name, year) %>%
  summarise(survey_count = sum(survey_count))

obs_count_wide <- obs_count %>%
  pivot_wider(names_from = year, values_from = survey_count)

# Add observatory approximate location
locations <- tibble(
  code = c(1, 2, 3, 4, 12, 13, 15, 16, 21, 22, 23, 24, 31, 25, 41, 42, 43, 51, 
           44, 45, 61, 17, 18, 19, 71, 52),
  name = c("Antalaha", "Antsirabe", "Marovoay", "Toliara coastal", "Antsohihy", 
           "Tsiroanomandidy", "Farafangana", "Ambovombe", 
           "Alaotra", "Manjakandriana", "Toliara North", 
           "Fenerive East", "Bekily", "Mahanoro", "Itasy", 
           "Menabe-Belo", "Fianarantsoa", "Tsivory", "Morondava", "Manandriana", 
           "Tanandava", "Ihosy", "Ambohimahasoa", "Manakara", "Tolanaro", 
           "Menabe North-East"),
  latitude = c(-14.8833, -19.8659, -16.1000, -23.7574, -14.8796, -18.7713, 
               -22.8167, -25.1667, -17.8319, -18.9167, -23.2941, -17.3500, 
               -24.6900, -19.9000, -19.1686, -19.6975, -21.4527, -24.4667, 
               -20.2833, -20.2333, -22.5711, -22.4000, -20.7145, -22.1333, 
               -25.0381, -20.5486),
  longitude = c(50.2833, 47.0333, 46.6333, 43.6770, 47.9875, 46.0546, 47.8333, 
                46.0833, 48.4167, 47.8000, 43.7761, 49.4167, 45.1700, 48.8000,
                46.7354, 44.5419, 47.0857, 45.4667, 44.2833, 47.3833, 45.0439, 
                46.1167, 47.0389, 48.0167, 46.9562, 47.1597))

obs_count <- left_join(obs_count, locations, by = c("observatory_name" = "name"))


madagascar <- st_read(paste0("data/Spatial_data/OCHA_BNGRC admin boundaries/",
                             "mdg_admbnda_adm0_BNGRC_OCHA_20181031.shp"),
                      quiet = TRUE)

# Sort locations by latitude to generate sequence numbers
locations <- locations %>%
  arrange(desc(latitude)) %>%
  mutate(seq_num = 1:n())

# Create map plot with labels
map_plot <- ggplot(data = madagascar) +
  geom_sf(fill = "lightgray", colour = "dimgrey") +
  geom_point(data = locations, aes(x = longitude, y = latitude, color = name), 
             size = 3) +
  geom_text(data = locations, aes(x = longitude, y = latitude, label = seq_num), 
            vjust = -1, hjust = 1, size = 3, # check_overlap = TRUE,
            fontface = "bold") + 
  theme_void() +
  theme(legend.position = "none")

# Add sequence numbers to observatory names in obs_count dataframe
obs_count <- obs_count %>%
  left_join(locations %>%
              select(name, seq_num), 
            by = c("observatory_name" = "name")) %>%
  mutate(observatory_with_num = paste0(seq_num, ". ", observatory_name))

# Create timeline plot using modified obs_count with observatory_with_num
timeline_plot <- ggplot(obs_count, 
                        aes(x = year, 
                            y = fct_reorder(observatory_with_num, latitude), 
                            color = observatory_name)) +
  geom_point(aes(size = 1), show.legend = F) +
  theme_minimal() +
  labs(y = NULL, x = NULL) +
  theme(axis.text.y = element_text(size = 8, face = "bold"),
        legend.position = "none")

# Stitch the plots together
combined_plot <- plot_grid(map_plot, timeline_plot, rel_widths = c(1.3, 2))

ggsave("output/figure_1.pdf", plot = combined_plot, 
       width = 10, height = 7, dpi = 300)
ggsave("output/figure_1.png", plot = combined_plot, 
       width = 10, height = 7, dpi = 300)

print(combined_plot)

Figure 1: Coarse location of rural observatory and survey years

Figure 1 shows the coarse location of the 26 observatories composing the ROS, as well as the years in which data was collected in each one.