4  Data preparation

In this section, we describe and reproduce the procedure followed to prepare the Rural Observatory System (ROS) data. Every operation on the raw source data is explicitly coded, and the data published online is the result of applying the steps below. The process entails selecting the relevant files and organizing them in the local file system, removing personal names and replacing them with pseudonyms, enriching the metadata with table labels, and converting the file formats to offer an open standard version alongside STATA.

Code
library(tidyverse)    # A series of packages for data manipulation
library(haven)        # Required for reading STATA files (.dta)
library(stringdist)   # String distances for fuzzy name matching
library(tictoc)       # Timing of long-running computations
library(progressr)    # Progress bars for long-running computations
library(future)       # Parallel processing backend
library(furrr)        # Parallel versions of purrr mapping functions
library(gt)           # Formatted display tables
library(readxl)       # Reading Excel files (.xlsx)
library(fs)           # Cross-platform file system operations

4.1 Data selection

First, we create a copy of the original unfiltered and un-anonymized data.

Code
# Define the paths for the source and target folders
source_folder <- "data/ROS_data_original"
target_folder <- "data/dta_format"

# Create the target folder if it does not exist
if (!dir.exists(target_folder)) {
  dir.create(target_folder, recursive = TRUE)
}

# Empty the target folder if it already contains files
if (length(list.files(target_folder, recursive = TRUE)) > 0) {
  # List all files within the target folder recursively
  files_to_remove <- list.files(target_folder, full.names = TRUE, recursive = TRUE)
  # Remove these files
  file.remove(files_to_remove)
}

# Function to recursively copy files from source to target
copy_files <- function(source, target) {
  # Ensure the target directory exists
  if (!dir.exists(target)) {
    dir.create(target, recursive = TRUE)
  }
  
  # List all files and directories in the source
  contents <- list.files(source, full.names = TRUE)
  
  # Separate files and directories
  dirs <- contents[which(sapply(contents, function(x) file.info(x)$isdir))]
  files <- contents[which(sapply(contents, function(x) !file.info(x)$isdir))]
  
  # Copy files
  if (length(files) > 0) {
    file.copy(files, target, overwrite = TRUE)
  }
  
  # Recursively copy directories
  if (length(dirs) > 0) {
    for (dir in dirs) {
      new_source <- dir
      new_target <- file.path(target, basename(dir))
      copy_files(new_source, new_target)
    }
  }
}

# Copy all files and folders from source to target
copy_files(source_folder, target_folder)

For 2015, we have 4 observatories: Menabe Nord-Est, which existed in the previous years, and 3 new ones, Ambatofinandrahana, Anjozorobe, and Maintirano. The ROS data collection continued until 2017, but we lack documentation after 2015, and the data for 2016 and 2017 has not yet been harmonized. For this reason, we only kept the data for Menabe Nord-Est in 2015.

Code
# Define the path to the 2015 folder within the target folder
folder_2015 <- "data/dta_format/2015"

# List all .dta files in the 2015 folder
dta_files <- list.files(folder_2015, pattern = "\\.dta$", full.names = TRUE)

# Initialize a variable to track if all files were successfully filtered
all_files_filtered <- TRUE

# Get households in selected observatory (Menabe Nord-Est)
hh_to_keep <- read_dta("data/dta_format/2015/res_deb.dta") %>%
  filter(j0 == 52) %>%
  pluck("j5")

# Loop through each .dta file
for (file_path in dta_files) {
  # Load the dataset
  data <- read_dta(file_path)
  # Check if 'j5' exists and is a character variable
  if ("j5" %in% names(data) && is.character(data$j5)) {
    # Filter for j5 
    filtered_data <- data %>%
      filter(j5 %in% hh_to_keep)
    # Save the filtered dataset back to the same file (or to a new file/path)
    write_dta(filtered_data, file_path)
  } else {
    # Set the flag to FALSE if j5 does not exist or is not character in any file
    all_files_filtered <- FALSE
    break  # Exit the loop as we found a file not meeting the criteria
  }
}

# Check if all files were successfully filtered and print a message
if (!all_files_filtered) {
  cat("Error: Not all files were filtered. At least one file does not contain 'j5' as a character variable.")
}

If no error message is displayed, the filtering completed correctly.
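
As an optional sanity check (not part of the original pipeline), we can re-read the filtered household register and confirm that only the Menabe Nord-Est observatory (code 52) remains. This assumes household identifiers are unique across observatories, as in the selection step above.

Code
# Optional check: after filtering, res_deb.dta should only contain
# households from observatory 52 (Menabe Nord-Est)
check <- read_dta("data/dta_format/2015/res_deb.dta") %>%
  distinct(j0)
stopifnot(all(check$j0 == 52))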

4.2 Data anonymization

4.2.1 Anonymization of survey respondents

During the rural observatory surveys, the names of the household members were collected in the questionnaire section called the roster. To prevent re-identification of personal data, we replace these names with pseudonyms such as “individual_01”, “individual_02”, and so on. These pseudonyms are not related to the original names, and individuals with the same name in different households are given different pseudonyms. However, the same household members keep the same pseudonym in subsequent surveys. For example, in a particular household, “individual_05” in 1998 is the same person as “individual_05” in the 1999, 2000, 2001, and 2002 survey rounds.
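
For illustration, consider a fabricated household roster (the names and identifiers below are invented for the example, not taken from the data):

Code
# Toy example of the pseudonym scheme: within household "0001", the same
# person keeps the same pseudonym across rounds, even when the recorded
# spelling of the name varies
toy_roster <- tribble(
  ~year, ~j5,    ~m1,       ~pseudonym,
  1998,  "0001", "Rakoto",  "individual_01",
  1998,  "0001", "Soa",     "individual_02",
  1999,  "0001", "Rakotoo", "individual_01",  # spelling variant, same person
  1999,  "0001", "Soa",     "individual_02"
)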

The main challenge with this procedure is that the names were provided orally by the respondent, written down by the surveyors, and later entered into the system by data entry clerks. As a result, we have a wide range of variations in the character strings in our data, even though they correspond to the names of the same individuals. To carry out this pseudonymization process, we follow several steps that involve fuzzy matching and consistency checks with individual age and sex. We begin by loading and consolidating the content of the survey rosters for all survey years.

Code
# Location of the ROS data and survey years covered
ros_data_loc <- "data/dta_format/"
years <- 1995:2015

# Normalizing function 
normalize_name <- function(name) {
  name %>%
    stringi::stri_trans_general("Latin-ASCII") %>%
    str_to_lower() %>%
    str_replace_all("[^a-z ]", "") %>%
    str_trim()
}

# Function to read and preprocess data
read_and_normalize <- function(year, ros_data_loc) {
  
  file_name <- if (year == 1995) "res_m.dta" else "res_m_a.dta"
  file_path <- file.path(ros_data_loc, as.character(year), file_name)
  
  if (!file.exists(file_path)) return(NULL)
  
  read_dta(file_path) %>%
    select(m1, year, j5, m4, m5) %>%
    mutate(m5 = as.numeric(m5),
           name_normalized = normalize_name(m1),
           line_number = row_number())
}

all_data <- map_df(years, ~read_and_normalize(.x, ros_data_loc))

# Get a list of unique household IDs
household_ids <- unique(all_data$j5)

# Count household ids and observations
nb_hh <- length(household_ids) # household_ids is a vector, so length(), not nrow()
nb_i <- nrow(all_data)

We have a total of 590,524 individual observations from 29,493 unique households. To match name variations across subsequent surveys of the same household, we use the Jaro-Winkler algorithm as implemented in the stringdist package. This algorithm is described as follows by the package author (van der Loo 2014, p. 119):

“The Jaro distance was originally developed at the U.S. Bureau of the Census for the purpose of linking records based on inaccurate text fields. (…) It has been successfully applied to statistical matching problems concerning fairly short strings, typically name and address data (Jaro 1989). The reasoning behind the Jaro distance is that character mismatches and transpositions are caused by typing or phonetic transcription errors but matches between remote characters are unlikely to be caused by such kind of errors. (…) Winkler (1990) extended the Jaro distance by incorporating an extra penalty for character mismatches in the first four characters. (…) The reasoning is that apparently, people are less apt to make mistakes in the first four characters or perhaps they are more easily noted, so differences in the first four characters point to a larger probability of two strings being actually different.”
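
To give a concrete sense of the distances involved, here is a small demonstration on invented name variants (the names are hypothetical and the exact values depend on the strings):

Code
# Hypothetical illustration of Jaro-Winkler distances with stringdist.
# Transcription variants yield small distances; unrelated names yield
# large ones, which motivates the thresholds used below.
stringdist("rasoanaivo", "rasoanaivo", method = "jw")   # 0: identical strings
stringdist("rasoanaivo", "rasaonaivo", method = "jw")   # small: transposition
stringdist("rasoanaivo", "randrianina", method = "jw")  # large: different name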

We refined a procedure that applies this algorithm in six steps:

  1. Initial reference: In the initial survey year of each household, member names are cataloged to serve as a reference;
  2. Close match identification: For each ensuing survey, we look for names that not only exhibit the smallest Jaro-Winkler distance from the reference names but also fall below a stringent threshold of 0.2 (i.e., we only consider a name matched when the names are very similar);
  3. Expanded criteria for matches: Absence of matches at step 2 for a given year prompts an extended search within the household, this time accommodating names with a distance below 0.3 if they align in sex and age, accounting for a 5-year margin to mitigate inaccuracies in age recall (i.e., we allow for slightly more dissimilar names if sex and age match);
  4. Validation of matches: For each match identified at step 2 or 3, we verify that there is no other household member name that is a better match based on the Jaro-Winkler distance. If there is, we remove it from the matched names;
  5. Pseudonym assignment: Matched names get a pseudonym “individual_XX”, with “XX” representing a sequential number;
  6. Sequential application: This procedure iterates through all names from the initial survey year, extending to unmatched names in subsequent years, thereby ensuring comprehensive coverage.

The code was adapted to gracefully handle edge cases, for instance when sex or age data is missing.
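
Part of this robustness comes from how case_when() treats missing values: a condition that evaluates to NA does not match, so a missing sex or age simply fails the second-level criterion instead of raising an error. A minimal sketch of that behavior:

Code
# Sketch of the NA semantics the matching logic relies on:
# an NA condition falls through to the next case
sex <- NA
case_when(
  sex == 1 ~ "matched",  # NA == 1 evaluates to NA, so this case is skipped
  TRUE ~ "unmatched"     # default
)
# returns "unmatched"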

Code
pseudonymize_household <- function(pool, distance_threshold1 = 0.2, 
                                   distance_threshold2 = 0.3,
                                   tolerance_yeardiff = 5) {
  years <- unique(pool$year) # extract list of existing years in dataset
  pseudonymized <- data.frame() # create empty dataframe
  next_pseudonym_id <- 1 # initialize the pseudonym id
  
  for (current_year in years) {
    staging <- subset(pool, year == current_year)
    # For subsequent years, attempt to match with existing pseudonyms
    for (i in seq_len(nrow(staging))) {
      name <- staging$name_normalized[i]
      sex <- staging$m4[i]
      age <- staging$m5[i]
      
      pool <- pool %>%
        mutate(dist = stringdist(name_normalized, name, method = "jw"),
               age_diff = abs(m5 - age - (year - current_year))) %>%
        group_by(year) %>%
        mutate(
          match = case_when(
            # First level of matching based on distance_threshold1
            dist == min(dist) & dist < distance_threshold1 ~ "matched",
            
            # Second level of matching based on distance_threshold2
            dist == min(dist) & dist < distance_threshold2 & 
              (is.na(m4) | m4 == sex) & age_diff <= tolerance_yeardiff ~ "matched",
            TRUE ~ "unmatched"), # default
          pseudonym = ifelse(match == "matched", 
                             sprintf("individual_%02d", next_pseudonym_id), 
                             NA_character_)) %>%
        ungroup()
      
      # Ensure 'match' column is explicitly treated as a character
      pool$match <- as.character(pool$match)
      
      # Then perform the operation to compute min_dist_unmatch and re-evaluate 'match'
      if (any(pool$match == "matched")) {
        unmatched_names <- pool$name_normalized[pool$match == "unmatched"]
        pool <- pool %>%
          rowwise() %>%
          mutate(min_dist_unmatch = if_else(match == "matched" & 
                                              length(unmatched_names) > 0,
                                            min(stringdist(name_normalized, 
                                                           unmatched_names, 
                                                           method = "jw"), 
                                                na.rm = TRUE),
                                            NA_real_),
                 match = if_else(match == "matched" & min_dist_unmatch < dist & 
                                   !is.na(min_dist_unmatch), "unmatched", match),
                 pseudonym = if_else(match == "unmatched", NA_character_, 
                                     pseudonym)) %>%
          ungroup()
      }
     
      # Identify and adjust duplicate pseudonyms within the same year for matched cases
      pool <- pool %>%
        group_by(year, pseudonym) %>%
        mutate(dup_count = n()) %>%
        ungroup() %>%
        mutate(is_dup = ifelse(dup_count > 1 & match == "matched", TRUE, FALSE)) %>%
        group_by(year, pseudonym) %>%
        mutate(dup_rank = ifelse(is_dup, row_number(), NA_integer_)) %>%
        ungroup() %>%
        mutate(match = ifelse(is_dup & dup_rank > 1, "unmatched", match),
               pseudonym = ifelse(is_dup & dup_rank > 1, NA_character_, pseudonym)) %>%
        select(-dup_count, -is_dup, -dup_rank)

      
      pool$match <- as.character(pool$match)
      pool$pseudonym <- as.character(pool$pseudonym)
      
      pseudonymized <- pseudonymized %>%
        bind_rows(filter(pool, match == "matched"))
      pool <- filter(pool, match != "matched")
      next_pseudonym_id <- next_pseudonym_id + 1
    }
  }
  return(pseudonymized)
}

# The following process is very long (~1h with a good computer)
# We only run it once
pseudo_loc <- "output/pseudonymized_all.rds"

if (!file.exists(pseudo_loc)) {
  # Set up parallel plan
  plan(multisession, workers = 6)
  
  # Define processing function to include progress signaling
  process_household_with_progress <- function(household_id, .progress) {
    .progress()  # Signal progress update
    pool <- all_data %>% filter(j5 == household_id)
    pseudonymized <- pseudonymize_household(pool)
    return(pseudonymized)
  }
  
  # tic()
  # Wrap processing in with_progress
  pseudonymized_all <- with_progress({
    # Create a progressor function inside with_progress
    p <- progressor(along = household_ids)
    
    future_map_dfr(household_ids,
                   ~process_household_with_progress(.x, p), .progress = FALSE)
  })
  # toc() # 3197.55 sec elapsed, 53 minutes
  
  write_rds(pseudonymized_all, pseudo_loc)
} else { # Otherwise we read the existing milestone
  pseudonymized_all <- read_rds(pseudo_loc)
}

for (year in years) {
  
  # Determine the file name based on the year
  file_name <- if (year == 1995) "res_m.dta" else "res_m_a.dta"
  file_path <- file.path(ros_data_loc, as.character(year), file_name)
  
  # Read the full dataset for the year
  res_m <- read_dta(file_path) %>%
    mutate(m5 = as.numeric(m5),  # Convert m5 to numeric
           line_number = row_number())
  
  # Merge pseudonym information from pseudonymized_all
  res_m_with_pseudonym <- res_m %>%
    left_join(pseudonymized_all %>% 
                select(m1, j5, m4, m5, year, line_number, pseudonym), 
              by = c("m1", "j5", "m4", "m5", "line_number", "year")) %>%
    relocate(pseudonym, .after = m1) %>% # Move pseudonym column after m1 if needed
    select(-m1, -line_number)
  
  # Check for missing pseudonym values
  missing_pseudonyms <- sum(is.na(res_m_with_pseudonym$pseudonym))
  if (missing_pseudonyms > 0) {
    stop(paste("Error: Missing pseudonym values found in year", year, 
               "- Total missing:", missing_pseudonyms))
  }
  
  # Write the dataset back to a Stata file
  write_dta(res_m_with_pseudonym, file_path)
}

After this process, the column “m1” containing the names of household members has been removed from all data files and replaced by the column “pseudonym”. Besides anonymizing the data, this process enables the tracking of repeated observations of the same individuals. We are now able to compute the number of unique individuals surveyed throughout the years.

Code
pseudo_loc <- "output/pseudonymized_all.rds"
pseudonymized_all <- read_rds(pseudo_loc)

# Total number of individual observations
total_individual_observations <- nrow(pseudonymized_all)

# Total number of unique households
unique_households <- pseudonymized_all %>% 
  distinct(j5) %>% 
  nrow()

# Total number of household observations across years
household_observations_across_years <- pseudonymized_all %>% 
  group_by(year) %>% 
  summarise(n_distinct_j5 = n_distinct(j5)) %>% 
  summarise(total = sum(n_distinct_j5))

# Average number of times a household was surveyed
average_surveys_per_household <- pseudonymized_all %>% 
  group_by(j5) %>% 
  summarise(n_surveys = n_distinct(year)) %>% 
  summarise(average = mean(n_surveys))

# Number of unique individuals (considering both household ID and pseudonym)
unique_individuals <- pseudonymized_all %>% 
  distinct(j5, pseudonym) %>% 
  nrow()

# Average number of times an individual was surveyed
average_surveys_per_individual <- total_individual_observations / unique_individuals

# Creating a summary table
summary_table <- tibble(
  Unique_households = unique_households,
  Household_observations = household_observations_across_years$total,
  Average_surveys_per_household = average_surveys_per_household$average,  
  Unique_individuals = unique_individuals,
  Individual_observations = total_individual_observations,
  Average_surveys_per_individual = average_surveys_per_individual) %>% 
  pivot_longer(cols = everything(), names_to = "Metric", values_to = "Value") %>%
  mutate(Metric = str_replace_all(Metric, "_", " "),
         FormattedValue = case_when(
           Value == floor(Value) ~ format(as.integer(Value), big.mark = ","),
           TRUE ~ sprintf("%.2f", Value)))

gt(summary_table) %>%
  cols_label(FormattedValue = "Value") %>% # Display 'FormattedValue' under the header 'Value'
  cols_hide(column = "Value") # Hide the unformatted numeric column
Table 4.1: Number of observations and unique entities in the ROS data from 1995 to 2015

Metric                            Value
Unique households                30,972
Household observations          102,733
Average surveys per household      3.32
Unique individuals              228,060
Individual observations         598,965
Average surveys per individual     2.63

As we see in Table 4.1, we have 228,060 unique individuals who were surveyed an average of 2.63 times.

4.2.2 Anonymization of surveyors

The surveyor, supervisor, and data entry clerk ID numbers are included in the datasets. For the years 2011 to 2015, their names were also included. We remove them before data publication.

Code
for (year in 2011:2015) {
  loc <- paste0("data/dta_format/", year, "/res_deb.dta")
  read_dta(loc) %>%
    select(-j1_a, -j2_a, -j3_a) %>% # Drop surveyor, supervisor and clerk names
    write_dta(loc)
}

At this stage, no personal names remain in the survey datasets.
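
As an optional verification (not part of the original pipeline), we can scan every remaining file and assert that none of the name columns removed above (m1 for household members; j1_a, j2_a, and j3_a for survey staff) is still present:

Code
# Optional check: verify that no name column survived anonymization
name_cols <- c("m1", "j1_a", "j2_a", "j3_a")
all_dta <- list.files("data/dta_format", pattern = "\\.dta$",
                      full.names = TRUE, recursive = TRUE)
# Read only the headers (n_max = 0) and keep files with leftover name columns
leftovers <- purrr::keep(all_dta,
                         ~any(name_cols %in% names(read_dta(.x, n_max = 0))))
if (length(leftovers) > 0) {
  warning("Name columns still present in: ", paste(leftovers, collapse = ", "))
}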

4.3 Table labels

Labels are included for all variables in the raw data, but not for the tables themselves. We manually transcribed the table labels from the questionnaires and include them in the STATA files. Because STATA dataset labels are limited to 80 characters, longer titles are trimmed.

Code
# Load the Excel file containing file names and labels
file_labels <- read_excel("references/file_labels.xlsx")

# Check and trim labels to 80 characters, and collect filenames needing trimming
file_labels <- file_labels %>% 
  mutate(needs_trimming = nchar(title_en) > 80,
         title_en = if_else(needs_trimming, substr(title_en, 1, 80), title_en))
files_with_trimmed_labels <- file_labels %>% 
  filter(needs_trimming) %>%
  pull(filename)

# Warn if any labels were trimmed
if (length(files_with_trimmed_labels) > 0) {
  warning("Labels for the following files were trimmed to 80 characters: ", paste(files_with_trimmed_labels, collapse = ", "))
}
# res_ccp3.dta, rx_jn.dta, rx_tj4.dta, rx_tj5.dta 


# Define the base path for the yearly data folders
base_path <- "data/dta_format"

# Get the list of yearly folders using base R
year_folders <- list.dirs(base_path, full.names = TRUE, recursive = FALSE)

# Function to read, check label length, and write .dta files
process_files <- function(year_folder) {
  dta_files <- list.files(year_folder, pattern = "\\.dta$", full.names = TRUE)
  
  purrr::walk(dta_files, function(file_path) {
    file_name <- basename(file_path)
    
    # Find corresponding label in the file_labels dataframe
    label <- file_labels %>%
      filter(filename == file_name) %>%
      pull(title_en) %>%
      first()
    
    # Proceed only if label is found
    if (!is.na(label)) {
      data <- read_dta(file_path)
      # Write the .dta file back with the new label
      write_dta(data, file_path, label = label)
    } 
  })
}

# Process files in each year folder
purrr::walk(year_folders, process_files)

The table labels can now be included in the data catalog published with the survey.

4.4 Data format conversions

The ROS survey data was originally entered and managed in STATA, which uses a proprietary format. To facilitate processing by all users, we also publish the data in an open format (tab-separated values, TSV).

Code
# Define source and target directories
source_dir <- "data/dta_format"
target_dir <- "data/tsv_format"

# Remove the target directory if it exists
if (dir_exists(target_dir)) {
  dir_delete(target_dir)
}

# Create the target directory
dir_create(target_dir)

# Get the list of year directories
year_dirs <- dir_ls(source_dir)

for (year_dir in year_dirs) {
  # Extract the year from the path
  year <- basename(year_dir)
  
  # Create the corresponding year directory in the target
  target_year_dir <- file.path(target_dir, year)
  dir_create(target_year_dir)
  
  # Get a list of Stata files in the current year directory
  stata_files <- dir_ls(year_dir, glob = "*.dta")
  
  for (stata_file in stata_files) {
    # Read the Stata file, preserving variable labels
    data <- read_dta(stata_file)
    
    # Convert labelled variables to factors where applicable
    data <- data %>% 
      mutate(across(where(~is.labelled(.x)), as_factor))
    
    # Define the output TSV file path, replacing the .dta extension with .tsv
    tsv_file <- file.path(target_year_dir,
                          path_ext_set(basename(stata_file), "tsv"))
    
    # Save the data frame as a TSV file
    write_tsv(data, tsv_file)
  }
}

The data is now ready to be uploaded in both formats: the proprietary STATA format and the open TSV format.
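
As a final optional check (not in the original pipeline), we can confirm that every STATA file has a TSV counterpart by comparing file counts between the two trees:

Code
# Optional check: each .dta file should have a matching .tsv file
n_dta <- length(dir_ls("data/dta_format", recurse = TRUE, glob = "*.dta"))
n_tsv <- length(dir_ls("data/tsv_format", recurse = TRUE, glob = "*.tsv"))
stopifnot(n_dta == n_tsv)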